Rework direct convolution heuristic on OpenCL

Resolves COMPMID-5634

Change-Id: I075de70d509d0c4430b4bcf3f218384e237a3a56
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/453708
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8473
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index c739b9d..a53fdbb 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -61,7 +61,14 @@
 
     if(gws_x < gws_y)
     {
-        return cl::NDRange(4, 16, 1);
+        if(gws_x < 4)
+        {
+            return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
+        }
+        else
+        {
+            return cl::NDRange(std::min(gws_x, static_cast<size_t>(4u)), 8, 1);
+        }
     }
     else
     {
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index e602fbb..2e7ed5a 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -116,12 +116,12 @@
     // In case of dynamic tensor support, the following dimensions should be passed as function argument.
 #define _IWEI_WIDTH WEI_WIDTH
 #define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH src_w
-#define _ISRC_HEIGHT src_h
-#define _ISRC_CHANNELS src_c
-#define _IDST_WIDTH dst_w
-#define _IDST_HEIGHT dst_h
-#define _IDST_CHANNELS dst_c
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
 
     // If quantized, the output tile has to be quantized first before being stored to global memory
@@ -159,12 +159,25 @@
 
     for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
     {
-        int ck = 0;
         int xk = i % _IWEI_WIDTH;
         int yk = i / _IWEI_WIDTH;
 
-        int k = 0;
-        for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+        TILE(int, M0, 1, my);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            int x_s = xi[i].v + xk;
+            int y_s = yi[i].v + yk;
+            my[i].v = x_s + y_s *_ISRC_WIDTH;
+            my[i].v = my[i].v + bout * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
+            my[i].v = select(-1, my[i].v, x_s >= 0);
+            my[i].v = select(-1, my[i].v, x_s < _ISRC_WIDTH);
+            my[i].v = select(-1, my[i].v, y_s >= 0);
+            my[i].v = select(-1, my[i].v, y_s < _ISRC_HEIGHT);
+        })
+
+        int ck = 0;
+        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
         {
             TILE(SRC_DATA_TYPE, M0, K0, a);
             TILE(WEI_DATA_TYPE, N0, K0, b);
@@ -175,13 +188,8 @@
                 a[i].v = ZERO_VALUE;
             })
 
-            LOOP_UNROLLING(int, i, 0, 1, N0,
-            {
-                b[i].v = ZERO_VALUE;
-            })
-
             // Load tile from the src tensor
-            T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
+            T_LOAD2D_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, my, a);
 
             // Load tile from the weights tensor
             T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
@@ -192,15 +200,13 @@
             // Apply the offset correction (correction usually needed for asymmetric quantized computation)
             // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
             T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
-            ck += K0;
         }
 
         // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
         // This #if directive should be removed in case of dynamic tensor support
 #if defined(LEFTOVER_LOOP)
         // Left-over accumulations
-        for(; k < _ISRC_CHANNELS; ++k)
+        for(; ck < _ISRC_CHANNELS; ++ck)
         {
             TILE(SRC_DATA_TYPE, M0, 1, a);
             TILE(WEI_DATA_TYPE, N0, 1, b);
@@ -229,8 +235,6 @@
             // Apply the offset correction (operation usually needed for asymmetric quantized computation)
             // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
             T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
-            ++ck;
         }
 #endif // defined(LEFTOVER_LOOP)
     }
@@ -249,17 +253,6 @@
 
 #endif // HAS_BIAS
 
-    TILE(uint, M0, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
-        dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
-    })
-
-    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
 #if defined(IS_QUANTIZED)
 
     TILE(DST_DATA_TYPE, M0, N0, cq);
@@ -271,6 +264,17 @@
     // Apply activation
     T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE);
 
+    TILE(uint, M0, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+        dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+    })
+
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
     // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
     // Store the tile in reverse order so the invalid values are overwritten with the valid ones
     T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 4b6144a..6279fb4 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -653,6 +653,17 @@
         })                                                                                                                                                            \
     })
 
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst)                \
+    ({                                                                                                                                                                \
+        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
+        {                                                                                                                                                             \
+            if(yi[_i].v >= 0)                                                                                                                                     \
+            {                                                                                                                                                         \
+                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y);                                                               \
+            }                                                                                                                                                         \
+        })                                                                                                                                                            \
+    })
+
 /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
  *
  * @param[in]  DATA_TYPE     Data type