COMPMID-895 - Optimizing CLDepthwiseConvolution3x3Kernel

This patch brings the MACs utilisation up to 25 % when both stride_x and stride_y are equal to 1
Performance reported in the following confluence page:
https://confluence.arm.com/display/MLENG/Depthwise+convolution+3x3+FP32+performance%3A+ACL+18.02

Change-Id: Ida1b64be9a88805902a3d90194559b58eb1224a3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/119068
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index a9167ee..2a60f60 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -98,9 +98,82 @@
     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
 
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+
+    // Configure kernel window
+    const unsigned int conv_pad_left   = std::max(conv_info.pad_left(), 1U);
+    const unsigned int conv_pad_top    = std::max(conv_info.pad_top(), 1U);
+    const unsigned int conv_pad_right  = std::max(conv_info.pad_right(), 1U);
+    const unsigned int conv_pad_bottom = std::max(conv_info.pad_bottom(), 1U);
+
+    unsigned int num_elems_read_per_iteration_x    = 0;
+    unsigned int num_elems_read_per_iteration_y    = 0;
+    unsigned int num_elems_written_per_iteration_x = 0;
+    unsigned int num_elems_written_per_iteration_y = 0;
+
     // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+    std::string kernel_name;
+
+    if(input->info()->data_type() == DataType::F32 && gpu_target == GPUTarget::BIFROST)
+    {
+        if(_conv_stride_x == 1 && _conv_stride_y == 1)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost";
+            num_elems_read_per_iteration_x    = 4;
+            num_elems_read_per_iteration_y    = 6;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 4;
+        }
+        else if(_conv_stride_x == 2 && _conv_stride_y == 2)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost";
+            num_elems_read_per_iteration_x    = 6;
+            num_elems_read_per_iteration_y    = 5;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 2;
+        }
+        else
+        {
+            kernel_name                       = "depthwise_convolution_3x3";
+            num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
+            num_elems_written_per_iteration_y = 1;
+            num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+            num_elems_read_per_iteration_y    = 3;
+        }
+    }
+    else
+    {
+        kernel_name                       = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
+        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
+        num_elems_written_per_iteration_y = 1;
+        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+        num_elems_read_per_iteration_y    = 3;
+    }
+
+    // Calculate right and bottom border
+    int input_width  = input->info()->dimension(0) + conv_pad_left + conv_pad_right;
+    int input_height = input->info()->dimension(1) + conv_pad_top + conv_pad_bottom;
+
+    // Add padding only if necessary or it would always result in a window_changed
+    input_width  = ceil_to_multiple(input_width, num_elems_read_per_iteration_x);
+    input_height = ceil_to_multiple(input_height, num_elems_read_per_iteration_y);
+
+    // Create window and update padding
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+    AccessWindowStatic    input_access(input->info(), -conv_pad_left, -conv_pad_top, input_width, input_height);
+    AccessWindowStatic    weights_access(weights->info(), 0, 0, 3, 3);
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+    update_window_and_padding(win, input_access, weights_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set static arguments
     if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
@@ -119,64 +192,9 @@
         _kernel.setArg(idx++, output_shift);
     }
 
-    // Configure the local work size for Bifrost with a value obtained
-    // via exhaustive autotuning for the MobileNets tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
-    {
-        // Assume uniform padding and striding.
-        const size_t pad    = _conv_pad_left;
-        const size_t stride = _conv_stride_x;
-        const size_t width  = input->info()->dimension(0);
-        if(pad == 1)
-        {
-            const size_t width_by_stride = width / stride;
-            if(width_by_stride == 28) // 56/2 or 28/1
-            {
-                _lws_hint = cl::NDRange(7, 4, 3);
-            }
-            else if(width_by_stride == 14) // 28/2 or 14/1
-            {
-                _lws_hint = cl::NDRange(7, 7, 4);
-            }
-        }
-        else if(pad == 0)
-        {
-            if(width >= 56) // 56 or 112
-            {
-                _lws_hint = cl::NDRange(8, 5, 2);
-            }
-            else if(width >= 14) // 14 or 28
-            {
-                _lws_hint = cl::NDRange(1, 5, 2);
-            }
-            else // 7
-            {
-                _lws_hint = cl::NDRange(1, 1, 2);
-            }
-        }
-    }
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
-    const unsigned int num_elems_written_per_iteration   = num_elems_processed_per_iteration;
-    const unsigned int num_elems_read_per_iteration      = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
-    const unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
-
-    update_window_and_padding(win, input_access, weights_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
-
     // Set config_id for enabling LWS tuning
-    _config_id = "depthwise_convolution3x3_";
+    _config_id = kernel_name;
+    _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->info()->data_type()));
     _config_id += "_";
     _config_id += support::cpp11::to_string(input->info()->dimension(0));