IVGCVSW-601: support for asymetric padding in cl conv and depthwise conv

Change-Id: I5c6c95091ae77dba96459c0640f9f6167a988c8c
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/91700
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/tests/datasets/DepthwiseConvolutionDataset.h b/tests/datasets/DepthwiseConvolutionDataset.h
index 593b823..8cceae0 100644
--- a/tests/datasets/DepthwiseConvolutionDataset.h
+++ b/tests/datasets/DepthwiseConvolutionDataset.h
@@ -125,6 +125,13 @@
         add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U), PadStrideInfo(1, 2, 1, 1));
         add_config(TensorShape(23U, 27U, 5U), TensorShape(11U, 3U, 5U), TensorShape(13U, 13U, 5U), PadStrideInfo(1, 2, 0, 0));
         add_config(TensorShape(17U, 31U, 2U, 3U), TensorShape(5U, 9U, 2U), TensorShape(15U, 13U, 2U, 3U), PadStrideInfo(1, 2, 1, 1));
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(11U, 12U, 7U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U), TensorShape(10U, 11U, 7U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index 8eda2e8..aa9d9f8 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -58,6 +58,14 @@
         add_config(TensorShape(17U, 31U, 2U, 4U), TensorShape(5U, 3U, 2U, 19U), TensorShape(19U), TensorShape(15U, 16U, 19U, 4U), PadStrideInfo(1, 2, 1, 1));
         // Arbitrary batch size
         add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0));
+
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
     }
 };
 } // namespace datasets
diff --git a/tests/validation/CPP/ConvolutionLayer.cpp b/tests/validation/CPP/ConvolutionLayer.cpp
index 656cd2e..ab3690a 100644
--- a/tests/validation/CPP/ConvolutionLayer.cpp
+++ b/tests/validation/CPP/ConvolutionLayer.cpp
@@ -26,6 +26,8 @@
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
 
+#include "tests/framework/Asserts.h"
+
 namespace arm_compute
 {
 namespace test
@@ -149,21 +151,24 @@
     const int width_weights  = weights.shape().x();
     const int height_weights = weights.shape().y();
     const int depth_weights  = weights.shape().z();
-    const int pad_xi         = std::min(static_cast<int>(info.pad().first), width_weights / 2);
-    const int pad_yi         = std::min(static_cast<int>(info.pad().second), height_weights / 2);
-    const int start_xi       = width_weights / 2 - pad_xi;
-    const int start_yi       = height_weights / 2 - pad_yi;
-    const int end_xi         = width_in - start_xi;
-    const int end_yi         = height_in - start_yi;
-    const int stride_xi      = info.stride().first;
-    const int stride_yi      = info.stride().second;
-    const int num_batches    = src.shape().total_size() / (width_in * height_in * depth_in);
+    const int pad_left       = std::min(static_cast<int>(info.pad_left()), width_weights / 2);
+    const int pad_top        = std::min(static_cast<int>(info.pad_top()), height_weights / 2);
+    const int pad_right      = std::min(static_cast<int>(info.pad_right()), width_weights / 2);
+    const int pad_bottom     = std::min(static_cast<int>(info.pad_bottom()), height_weights / 2);
+
+    const int start_xi    = width_weights / 2 - pad_left;
+    const int start_yi    = height_weights / 2 - pad_top;
+    const int end_xi      = width_in + pad_left - width_weights / 2 + pad_right - width_weights / 2;
+    const int end_yi      = height_in + pad_top - height_weights / 2 + pad_bottom - height_weights / 2;
+    const int stride_xi   = info.stride().first;
+    const int stride_yi   = info.stride().second;
+    const int num_batches = src.shape().total_size() / (width_in * height_in * depth_in);
 
     for(int r = 0; r < num_batches; ++r)
     {
-        for(int yi = start_yi; yi < end_yi; yi += stride_yi)
+        for(int yi = start_yi; yi < start_yi + end_yi; yi += stride_yi)
         {
-            for(int xi = start_xi; xi < end_xi; xi += stride_xi)
+            for(int xi = start_xi; xi < start_xi + end_xi; xi += stride_xi)
             {
                 for(int ofm = 0; ofm < depth_out; ++ofm)
                 {
@@ -173,6 +178,9 @@
                     const int yo         = (yi - start_yi) / stride_yi;
                     const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
 
+                    ARM_COMPUTE_ASSERT(xo < width_out);
+                    ARM_COMPUTE_ASSERT(yo < height_out);
+
                     // Compute 3D convolution
                     convolution3d(src.data() + offset_in,
                                   weights.data() + ofm * width_weights * height_weights * depth_weights,
diff --git a/tests/validation/CPP/DepthwiseConvolution.cpp b/tests/validation/CPP/DepthwiseConvolution.cpp
index ae54494..b57c268 100644
--- a/tests/validation/CPP/DepthwiseConvolution.cpp
+++ b/tests/validation/CPP/DepthwiseConvolution.cpp
@@ -51,29 +51,35 @@
     SimpleTensor<T> dst{ dst_shape, src.data_type(), 1, src.fixed_point_position() };
 
     // Compute reference
-    const size_t filter_width  = weights.shape().x();
-    const size_t filter_height = weights.shape().y();
-    const size_t filter_plane  = filter_width * filter_height;
-    const size_t input_width   = src.shape().x();
-    const size_t input_height  = src.shape().y();
-    const size_t input_depth   = src.shape().z();
-    const int    num_batches   = src.shape().total_size() / (input_width * input_height * input_depth);
+    const int filter_width  = weights.shape().x();
+    const int filter_height = weights.shape().y();
+    const int filter_plane  = filter_width * filter_height;
+    const int input_width   = src.shape().x();
+    const int input_height  = src.shape().y();
+    const int input_depth   = src.shape().z();
+    const int num_batches   = src.shape().total_size() / (input_width * input_height * input_depth);
 
-    const size_t filter_half_width  = filter_width / 2;
-    const size_t filter_half_height = filter_height / 2;
-    const size_t pad_x              = std::min(filter_half_width, static_cast<size_t>(conv_info.pad().first));
-    const size_t pad_y              = std::min(filter_half_height, static_cast<size_t>(conv_info.pad().second));
-    const size_t minimum_x          = -pad_x + filter_half_width;
-    const size_t minimum_y          = -pad_y + filter_half_height;
+    const int filter_half_width  = filter_width / 2;
+    const int filter_half_height = filter_height / 2;
+
+    const int pad_left   = std::min(static_cast<int>(conv_info.pad_left()), filter_half_width);
+    const int pad_top    = std::min(static_cast<int>(conv_info.pad_top()), filter_half_height);
+    const int pad_right  = std::min(static_cast<int>(conv_info.pad_right()), filter_half_width);
+    const int pad_bottom = std::min(static_cast<int>(conv_info.pad_bottom()), filter_half_height);
+
+    const int minimum_x = -pad_left + filter_half_width;
+    const int minimum_y = -pad_top + filter_half_height;
+    const int maximum_x = input_width + pad_left - filter_half_width + pad_right - filter_half_width;
+    const int maximum_y = input_height + pad_top - filter_half_height + pad_bottom - filter_half_height;
 
     int out_pos = 0;
     for(int r = 0; r < num_batches; ++r)
     {
-        for(size_t z = 0; z < input_depth; ++z)
+        for(int z = 0; z < input_depth; ++z)
         {
-            for(size_t y = minimum_y; y < input_height - minimum_y; y += conv_info.stride().second)
+            for(int y = minimum_y; y < minimum_y + maximum_y; y += conv_info.stride().second)
             {
-                for(size_t x = minimum_x; x < input_width - minimum_x; x += conv_info.stride().first)
+                for(int x = minimum_x; x < minimum_x + maximum_x; x += conv_info.stride().first)
                 {
                     Coordinates coords(static_cast<int>(x), static_cast<int>(y), static_cast<int>(z), static_cast<int>(r));
                     size_t      filter_offset = filter_plane * z;