Use an #if directive instead of a runtime condition in CLDirectConv2D

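Guard the left-over accumulation loop of the NHWC direct convolution
kernel with a compile-time #if defined(LEFTOVER_LOOP) block instead of
a runtime if statement. The host now defines LEFTOVER_LOOP only when
the number of input channels is not a multiple of K0, so kernels with
a well-aligned channel count do not compile the tail loop at all. In
addition, tighten the direct convolution heuristic in ClConv2d: the
OFM <= 4 special case is removed, and the large-kernel DIRECT path now
also requires IFM >= 16.
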
Resolves COMPMID-5004

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ib3e1b5a891234316c411ea9825ec10c68c4ab5a3
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6788
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
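
For illustration only, the pattern introduced here looks like this in a
minimal stand-alone kernel (dot_product, a, b, out and channels are
hypothetical names, not part of this patch; K0 and LEFTOVER_LOOP are
expected as -D build options, which the host supplies below in
ClDirectConv2dKernel.cpp via build_options.add_option_if):

    __kernel void dot_product(__global const float *a,
                              __global const float *b,
                              __global float       *out,
                              int                   channels)
    {
        float acc = 0.f;
        int   k   = 0;

        // Main loop: accumulate K0 channels per iteration
        // (K0 is a build-time define passed by the host)
        for(; k <= channels - K0; k += K0)
        {
            for(int i = 0; i < K0; ++i)
            {
                acc += a[k + i] * b[k + i];
            }
        }

    #if defined(LEFTOVER_LOOP)
        // Left-over accumulations: compiled only when the host passes
        // -DLEFTOVER_LOOP, i.e. when channels is not a multiple of K0
        for(; k < channels; ++k)
        {
            acc += a[k] * b[k];
        }
    #endif // defined(LEFTOVER_LOOP)

        *out = acc;
    }

When the channel count is a multiple of K0 the host omits
-DLEFTOVER_LOOP and the OpenCL compiler never sees the tail loop, which
is exactly what the kernel change below relies on.
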
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 35ff86a..f1b422a 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -192,36 +192,35 @@
 
         // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
         // This #if directive should be removed in case of dynamic tensor support
-        if((_ISRC_CHANNELS % K0) != 0)
+#if defined(LEFTOVER_LOOP)
+        // Left-over accumulations
+        for(; k < _ISRC_CHANNELS; ++k)
         {
-            // Left-over accumulations
-            for(; k < _ISRC_CHANNELS; ++k)
+            TILE(SRC_DATA_TYPE, M0, 1, a);
+            TILE(WEI_DATA_TYPE, N0, 1, b);
+
+            LOOP_UNROLLING(int, i, 0, 1, M0,
             {
-                TILE(SRC_DATA_TYPE, M0, 1, a);
-                TILE(WEI_DATA_TYPE, N0, 1, b);
+                a[i].v = ZERO_VALUE;
+            })
 
-                LOOP_UNROLLING(int, i, 0, 1, M0,
-                {
-                    a[i].v = ZERO_VALUE;
-                })
+            // Load tile from the src tensor
+            T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
 
-                // Load tile from the src tensor
-                T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a);
+            // Load tile from the weights tensor
+            // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+            T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
 
-                // Load tile from the weights tensor
-                // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
-                T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b);
+            // Compute the matrix multiplication between two tiles
+            T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
 
-                // Compute the matrix multiplication between two tiles
-                T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+            // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+            // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+            T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
 
-                // Apply the offset correction (operation usually needed for asymmetric quantized computation)
-                // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
-                T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
-
-                ++ck;
-            }
+            ++ck;
         }
+#endif // defined(LEFTOVER_LOOP)
     }
 
     // Offset correction required for the quantized asymmetric computation
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 7107def..5af7aa9 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -453,6 +453,7 @@
         build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
         build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
         build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+        build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
         build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
 
         if(is_data_type_quantized(data_type))
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index 92b22e7..2b9a51d 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -258,20 +258,15 @@
                 const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
                 const bool is_ifm_ge_16       = src->dimension(idx_c) >= 16;
                 const bool is_ifm_gt_ofm      = weights->dimension(0U) * weights->dimension(1U) * weights->dimension(2U) > weights->dimension(3U);
-                const bool is_ofm_le_4        = weights->dimension(3U) <= 4;
 
                 // Run Winograd if valid and IFM >= 16
                 if(is_wino_valid && is_ifm_ge_16)
                 {
                     return ConvolutionMethod::WINOGRAD;
                 }
-                // Run Direct for Large kernel size
-                if(is_large_kernel_sz && is_ifm_gt_ofm && is_direct_valid)
-                {
-                    return ConvolutionMethod::DIRECT;
-                }
 
-                if(is_ofm_le_4 && is_ifm_gt_ofm && is_direct_valid)
+                // Run Direct for Large kernel size
+                if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm)
                 {
                     return ConvolutionMethod::DIRECT;
                 }