COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX

Introducing new CLArgMinMax kernel

Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 28b1a32..9754beb 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -365,4 +365,12 @@
         return static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
 }
+
+cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
+{
+    // Pad the dimension to a multiple of vector_size; one thread per 16 elements (literal, not vector_size), at most 8.
+    const unsigned int remainder    = input_dimension % vector_size;
+    const unsigned int padded_width = (remainder == 0) ? input_dimension : input_dimension + (vector_size - remainder);
+    return cl::NDRange(std::min(8U, padded_width / 16));
+}
 } // namespace arm_compute