Rework DepthwiseConvolution heuristic on OpenCL

Resolves COMPMID-5632

Change-Id: I2bdbe69a610ca2510fbd74d5d412842679299762
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8365
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 277cba4..cded319 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -59,7 +59,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_weights_to_cl_image(weights) == false), "Export to cl_image not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!");
     ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -161,7 +162,8 @@
       _depth_multiplier(1),
       _output_multipliers(nullptr),
       _output_shifts(nullptr),
-      _export_to_cl_image(false),
+      _export_input_to_cl_image(false),
+      _export_weights_to_cl_image(false),
       _is_quantized(false)
 {
     _type = CLKernelType::DEPTHWISE;
@@ -192,15 +194,16 @@
     const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info);
     auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
 
-    _input              = input;
-    _output             = output;
-    _weights            = weights;
-    _biases             = biases;
-    _depth_multiplier   = conv_info.depth_multiplier;
-    _output_multipliers = output_multipliers;
-    _output_shifts      = output_shifts;
-    _export_to_cl_image = dwc_info.export_weights_to_cl_image;
-    _is_quantized       = is_data_type_quantized(input->info()->data_type());
+    _input                      = input;
+    _output                     = output;
+    _weights                    = weights;
+    _biases                     = biases;
+    _depth_multiplier           = conv_info.depth_multiplier;
+    _output_multipliers         = output_multipliers;
+    _output_shifts              = output_shifts;
+    _export_input_to_cl_image   = dwc_info.export_input_to_cl_image;
+    _export_weights_to_cl_image = dwc_info.export_weights_to_cl_image;
+    _is_quantized               = is_data_type_quantized(input->info()->data_type());
 
     const unsigned int n0          = adjust_vec_size(dwc_info.n0, output->info()->dimension(0));
     const unsigned int m0          = std::min(dwc_info.m0, (unsigned int)output->info()->dimension(1));
@@ -208,8 +211,13 @@
 
     CLBuildOptions build_opts;
 
-    // Update the padding for the weights tensor if we can export to cl_image
-    if(_export_to_cl_image)
+    // Update the padding of the input and/or weights tensor when it is exported to cl_image
+    if(_export_input_to_cl_image)
+    {
+        arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
+    }
+
+    if(_export_weights_to_cl_image)
     {
         arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
     }
@@ -234,14 +242,18 @@
 
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(conv_info.depth_multiplier));
-    build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+    build_opts.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
     // Note: SRC_DATA_TYPE must have the same data type of WEI_DATA_TYPE. In quantized, we could
     // have a case where the data types for the activation and weights are different. However, since the implementation
     // only works when both have same data type, we have to change the offset to take into account this aspect
     build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
     build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
-    build_opts.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_opts.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(1)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(1)));
+    build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(2)));
     build_opts.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(1)));
     build_opts.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(2)));
     build_opts.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
@@ -353,24 +365,39 @@
 
     Window slice = window_collapsed.first_slice_window_4D();
 
+    cl::Image2D input_cl_image;
     cl::Image2D weights_cl_image;
 
-    if(_export_to_cl_image)
+    if(_export_input_to_cl_image || _export_weights_to_cl_image)
     {
-        const size_t      image_w = _weights->info()->dimension(0) / 4;
-        const size_t      image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
-        const TensorShape shape2d(image_w, image_h);
-        const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
-
         // Export cl_buffer to cl_image
-        weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+        if(_export_input_to_cl_image)
+        {
+            const size_t      image_w = _input->info()->dimension(0) / 4;
+            const size_t      image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _input->info()->strides_in_bytes()[1];
+            input_cl_image                    = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch);
+        }
+
+        if(_export_weights_to_cl_image)
+        {
+            const size_t      image_w = _weights->info()->dimension(0) / 4;
+            const size_t      image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+            const TensorShape shape2d(image_w, image_h);
+            const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
+            weights_cl_image                  = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+        }
     }
 
     unsigned int idx = 0;
+    if(_export_input_to_cl_image)
+    {
+        _kernel.setArg(idx++, input_cl_image);
+    }
     add_4d_tensor_nhwc_argument(idx, _input);
     add_4d_tensor_nhwc_argument(idx, _output);
-
-    if(_export_to_cl_image)
+    if(_export_weights_to_cl_image)
     {
         _kernel.setArg(idx++, weights_cl_image);
     }