Rework direct convolution heuristic on OpenCL

Resolves COMPMID-5634

Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index c739b9d..a53fdbb 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -61,7 +61,14 @@
 
     if(gws_x < gws_y)
     {
-        return cl::NDRange(4, 16, 1);
+        if(gws_x < 4)
+        {
+            return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
+        }
+        else
+        {
+            return cl::NDRange(std::min(gws_x, static_cast<size_t>(4u)), 8, 1);
+        }
     }
     else
     {