COMPMID-1816: Use parallel reduction on 0 axis in CL ARG_MIN/ARG_MAX
Introducing new CLArgMinMax kernel
Change-Id: I0b8254207cc3859d19ceef9b6429cf5c1c586db0
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2202
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 28b1a32..9754beb 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -365,4 +365,12 @@
return static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
}
+
+cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
+{
+ const unsigned int width_leftover = input_dimension % vector_size;
+ const unsigned int border_width = (width_leftover != 0) ? vector_size - width_leftover : 0;
+ const unsigned int num_of_threads = ((input_dimension + border_width) / 16);
+ return cl::NDRange(std::min(8U, num_of_threads));
+}
} // namespace arm_compute