[Review Shape] CLDepthwiseConvolutionLayer mismatches - Fixed a bug in how the number of dimensions of a TensorShape is corrected when trailing 1s are added - Avoided adding offset_first_element for the Depthwise 3x3 NCHW OpenCL kernels, since it wouldn't align with the window, which is based on the output - Adjusted padding requirements along the x axis for Depthwise 3x3 NCHW. The kernel should always add 2 * dilation_(x/y) to the num_elems_read_x/y - Adjusted the kernel's border_size given to the border handler at function level - Added the dataset that previously made the tests fail Resolves: COMPMID-4041 Change-Id: Ifab7d38b263f12173fcc96a5f0bd3375756c3c53 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4673 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: SiCong Li <sicong.li@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>

commit: 15bc8485ef463508838a549b7e8518bf05883155 [log] [tgz]
author: Giorgio Arena <giorgio.arena@arm.com> Tue Dec 08 14:34:00 2020 +0000
committer: Giorgio Arena <giorgio.arena@arm.com> Thu Dec 10 18:55:25 2020 +0000
tree: 3c9beb3956c9cf4eae95b09ef678cb4bb0506a9e
parent: 63bb7ca40e30b2db48d7bdd1adbc8223b53ac23c [diff] [blame]
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index 25d0d27..ba7a782 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp

@@ -211,8 +211,11 @@
         num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x + (conv_stride_x > 1 ? 1 : 0);
         num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
     }
-    num_elems_read_per_iteration_x += (num_elems_read_per_iteration_x - 1) * (dilation.x() - 1);
-    num_elems_read_per_iteration_y += (num_elems_read_per_iteration_y - 1) * (dilation.y() - 1);
+    // The OpenCL routine convolution1x3 does loadn(addr), loadn(addr + dilation_x) and loadn(addr + 2 * dilation_x) on the input.
+    // Each of the three convolution1x3 gets called by passing addr, (addr + dilation_y) and (addr + 2 * dilation_y)
+    // Hence we must add 2 * dilation.x/y() to the number of elements read in those axes per thread
+    num_elems_read_per_iteration_x += 2 * dilation.x();
+    num_elems_read_per_iteration_y += 2 * dilation.y();
 
     // Create window and update padding
     Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
@@ -267,7 +270,6 @@
     _conv_stride_y      = conv_info.stride().second;
     _conv_pad_left      = conv_info.pad_left();
     _conv_pad_top       = conv_info.pad_top();
-    _border_size        = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
     _output_multipliers = output_multipliers;
     _output_shifts      = output_shifts;
     _is_quantized       = is_data_type_quantized_asymmetric(input->info()->data_type());
@@ -280,6 +282,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
+    _border_size = BorderSize(input->info()->padding());
+
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
commit	15bc8485ef463508838a549b7e8518bf05883155	[log] [tgz]
author	Giorgio Arena <giorgio.arena@arm.com>	Tue Dec 08 14:34:00 2020 +0000
committer	Giorgio Arena <giorgio.arena@arm.com>	Thu Dec 10 18:55:25 2020 +0000
tree	3c9beb3956c9cf4eae95b09ef678cb4bb0506a9e
parent	63bb7ca40e30b2db48d7bdd1adbc8223b53ac23c [diff] [blame]