COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Introducing new CLArgMinMax kernel

Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index cd65eaf..7e549be 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -190,5 +190,16 @@
  * @return An opencl kernel
  */
 cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts);
+
+/** Creates a suitable LWS hint object for parallel implementations. Sets the number of WG based on the input size.
+ *  If the input width is smaller than 128, fewer than 8 threads can be used.
+ *
+ * @param[in] input_dimension number of elements along the dimension to which the parallelization is applied
+ * @param[in] vector_size     size of the vector in OpenCL
+ *
+ * @return An LWS hint object
+ */
+cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
+
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLHELPERS_H__ */