COMPMID-765: Fix input access windows in DirectConvLayer and DepthwiseConvLayer

Change-Id: I6c68733c8a2ada12aa3994e3e5213d20222df861
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/121637
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index c24420a..29564b3 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -121,11 +121,6 @@
     const GPUTarget gpu_target = get_arch_from_target(get_target());
 
     // Configure kernel window
-    const unsigned int conv_pad_left   = conv_info.pad_left();
-    const unsigned int conv_pad_top    = conv_info.pad_top();
-    const unsigned int conv_pad_right  = conv_info.pad_right();
-    const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-
     unsigned int num_elems_read_per_iteration_x    = 0;
     unsigned int num_elems_read_per_iteration_y    = 0;
     unsigned int num_elems_written_per_iteration_x = 0;
@@ -139,8 +134,22 @@
         kernel_name                       = "depthwise_convolution_3x3_f16";
         num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
         num_elems_written_per_iteration_y = 1;
-        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
         num_elems_read_per_iteration_y    = 3;
+        switch(_conv_stride_x)
+        {
+            case 1:
+                num_elems_read_per_iteration_x = 8;
+                break;
+            case 2:
+                num_elems_read_per_iteration_x = 9;
+                break;
+            case 3:
+                num_elems_read_per_iteration_x = 16;
+                break;
+            default:
+                num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+                break;
+        }
     }
     else if(input->info()->data_type() == DataType::F32 && gpu_target == GPUTarget::BIFROST)
     {
@@ -178,18 +187,12 @@
         num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
     }
 
-    // Calculate right and bottom border
-    int input_width  = input->info()->dimension(0) + conv_pad_left + conv_pad_right;
-    int input_height = input->info()->dimension(1) + conv_pad_top + conv_pad_bottom;
-
-    // Add padding only if necessary or it would always result in a window_changed
-    input_width  = ceil_to_multiple(input_width, num_elems_read_per_iteration_x);
-    input_height = ceil_to_multiple(input_height, num_elems_read_per_iteration_y);
-
     // Create window and update padding
     Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
 
-    AccessWindowStatic    input_access(input->info(), -conv_pad_left, -conv_pad_top, input_width, input_height);
+    AccessWindowRectangle input_access(input->info(), -_conv_pad_left, -_conv_pad_top,
+                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                                       _conv_stride_x, _conv_stride_y);
     AccessWindowStatic    weights_access(weights->info(), 0, 0, 3, 3);
     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
 
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 6f5c7a3..c01a666 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -241,7 +241,9 @@
     bool   window_changed = false;
     Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
 
-    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
+    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                                       conv_stride_x, conv_stride_y);
     AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
     AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
 
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index dad4fee..f5ee608 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -238,6 +238,7 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_output->info()->tensor_shape(), output_shape);
 
     const unsigned int conv_stride_x   = _conv_info.stride().first;
+    const unsigned int conv_stride_y   = _conv_info.stride().second;
     const unsigned int conv_pad_top    = _conv_info.pad_top();
     const unsigned int conv_pad_right  = _conv_info.pad_right();
     const unsigned int conv_pad_bottom = _conv_info.pad_bottom();
@@ -264,15 +265,10 @@
     // Configure kernel window
     Window win = calculate_max_window(*_output->info(), Steps(_num_elems_written_per_iteration));
 
-    const unsigned int num_x_steps               = (output_shape.x() + _num_elems_written_per_iteration - 1) / _num_elems_written_per_iteration;
-    const int          input_num_elems_processed = get_input_num_elems_processed(_num_elems_written_per_iteration, conv_stride_x);
-
-    AccessWindowStatic input_access(_input->info(),
-                                    -conv_pad_left,
-                                    -conv_pad_top,
-                                    (num_x_steps - 1) * input_num_elems_processed + num_elems_read_per_iteration,
-                                    _input->info()->tensor_shape().y() + conv_pad_bottom);
-    AccessWindowStatic     weights_access(_weights->info(), 0, 0, _weights->info()->dimension(0), _weights->info()->dimension(1));
+    AccessWindowRectangle input_access(_input->info(), -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration, 3,
+                                       conv_stride_x, conv_stride_y);
+    AccessWindowStatic     weights_access(_weights->info(), 0, 0, 3, 3);
     AccessWindowHorizontal output_access(_output->info(), 0, _num_elems_written_per_iteration);
 
     update_window_and_padding(win, input_access, weights_access, output_access);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 4dc186a..285ec2d 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1053,8 +1053,8 @@
     // Calculate right and bottom border
     unsigned int kernel_size   = weights->dimension(0);
     const int    conv_stride_x = std::get<0>(conv_info.stride());
+    const int    conv_stride_y = std::get<1>(conv_info.stride());
     const int    input_width   = input->dimension(0);
-    const int    input_height  = input->dimension(1);
 
     switch(kernel_size)
     {
@@ -1135,8 +1135,12 @@
     border_size.right  = conv_pad_right;
     border_size.bottom = conv_pad_bottom;
 
-    Window                 win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-    AccessWindowStatic     input_access(input, -conv_pad_left, -conv_pad_top, input_width + conv_pad_right, input_height + conv_pad_bottom);
+    // Configure window
+    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration, kernel_size,
+                                       conv_stride_x, conv_stride_y);
     AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
     AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 20bf6cd..8ac882c 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -120,7 +120,7 @@
 {
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthwiseConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
 {