COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4)

Implemented Winograd Output Transform (2x2,3x3) on OpenCL
Implemented CLWinogradConvolutionLayer on OpenCL

Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb..c760663 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -39,79 +39,6 @@
 namespace
 {
 template <typename T>
-void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
-{
-    TensorShape shape4x4(4u, 4u);
-
-    // Simple tensor for the 4x4 input tile
-    SimpleTensor<T> src_tile{ shape4x4, src.data_type() };
-
-    // Simple tensor for the 4x4 temporary tile
-    SimpleTensor<T> tmp_tile{ shape4x4, src.data_type() };
-
-    // Simple tensor for the 4x4 output tile
-    SimpleTensor<T> dst_tile{ shape4x4, src.data_type() };
-
-    // Simple tensor for the transformation matrix
-    SimpleTensor<T> matrix{ shape4x4, src.data_type() };
-
-    // Simple tensor for the transformation matrix transposed
-    SimpleTensor<T> matrix_transposed{ shape4x4, src.data_type() };
-
-    const float matrix_values[] = { 1.f, 0.f, -1.f, 0.f,
-                                    0.f, 1.f, 1.f, 0.f,
-                                    0.f, -1.f, 1.f, 0.f,
-                                    0.f, 1.f, 0.f, -1.f
-                                  };
-
-    for(int i = 0; i < matrix.num_elements(); ++i)
-    {
-        matrix[i] = matrix_values[i];
-    }
-
-    transpose_matrix(matrix, matrix_transposed);
-
-    const int in_w        = src.shape().x();
-    const int in_h        = src.shape().y();
-    const int in_d        = src.shape().z();
-    const int num_batches = src.shape().total_size() / (in_w * in_h * in_d);
-    const int num_tiles_x = std::ceil((in_w - 2 + conv_info.pad_left() + conv_info.pad_right()) / 2.0f);
-    const int num_tiles_y = std::ceil((in_h - 2 + conv_info.pad_top() + conv_info.pad_bottom()) / 2.0f);
-
-    ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(dst.shape().y()));
-
-    for(int b = 0; b < num_batches; ++b)
-    {
-        for(int z = 0; z < in_d; ++z)
-        {
-            for(int y = 0; y < num_tiles_y; ++y)
-            {
-                for(int x = 0; x < num_tiles_x; ++x)
-                {
-                    int xi = x * 2 - conv_info.pad_left();
-                    int yi = y * 2 - conv_info.pad_top();
-
-                    // Get the 4x4 tile from the input tensor
-                    get_tile(src, src_tile, Coordinates(xi, yi, z, b));
-
-                    // Compute the transformation
-                    matrix_multiply(matrix, src_tile, tmp_tile);
-                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
-
-                    // Store the 4x4 output tile across the 16 channels
-                    for(int i = 0; i < 16; ++i)
-                    {
-                        int xo = z;
-                        int yo = x + y * num_tiles_x;
-                        dst[coords2index(dst.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
-                    }
-                }
-            }
-        }
-    }
-}
-
-template <typename T>
 void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
 {
     // Simple tensor for the 3x3 input tile
@@ -191,6 +118,179 @@
         }
     }
 }
+
+template <typename T>
+void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
+{
+    TensorShape shape4x4(4u, 4u);
+
+    // Simple tensor for the 4x4 input tile
+    SimpleTensor<T> src_tile{ shape4x4, src.data_type() };
+
+    // Simple tensor for the 4x4 temporary tile
+    SimpleTensor<T> tmp_tile{ shape4x4, src.data_type() };
+
+    // Simple tensor for the 4x4 output tile
+    SimpleTensor<T> dst_tile{ shape4x4, src.data_type() };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> matrix{ shape4x4, src.data_type() };
+
+    // Simple tensor for the transformation matrix transposed
+    SimpleTensor<T> matrix_transposed{ shape4x4, src.data_type() };
+
+    const float matrix_values[] = { 1.f, 0.f, -1.f, 0.f,
+                                    0.f, 1.f, 1.f, 0.f,
+                                    0.f, -1.f, 1.f, 0.f,
+                                    0.f, 1.f, 0.f, -1.f
+                                  };
+
+    for(int i = 0; i < matrix.num_elements(); ++i)
+    {
+        matrix[i] = matrix_values[i];
+    }
+
+    transpose_matrix(matrix, matrix_transposed);
+
+    const int in_w        = src.shape().x();
+    const int in_h        = src.shape().y();
+    const int in_d        = src.shape().z();
+    const int num_batches = src.shape().total_size() / (in_w * in_h * in_d);
+    const int num_tiles_x = std::ceil((in_w - 2 + conv_info.pad_left() + conv_info.pad_right()) / 2.0f);
+    const int num_tiles_y = std::ceil((in_h - 2 + conv_info.pad_top() + conv_info.pad_bottom()) / 2.0f);
+
+    ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(dst.shape().y()));
+
+    for(int b = 0; b < num_batches; ++b)
+    {
+        for(int z = 0; z < in_d; ++z)
+        {
+            for(int y = 0; y < num_tiles_y; ++y)
+            {
+                for(int x = 0; x < num_tiles_x; ++x)
+                {
+                    int xi = x * 2 - conv_info.pad_left();
+                    int yi = y * 2 - conv_info.pad_top();
+
+                    // Get the 4x4 tile from the input tensor
+                    get_tile(src, src_tile, Coordinates(xi, yi, z, b));
+
+                    // Compute the transformation
+                    matrix_multiply(matrix, src_tile, tmp_tile);
+                    matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+
+                    // Store the 4x4 output tile across the 16 channels
+                    for(int i = 0; i < 16; ++i)
+                    {
+                        int xo = z;
+                        int yo = x + y * num_tiles_x;
+                        dst[coords2index(dst.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
+                    }
+                }
+            }
+        }
+    }
+}
+
+template <typename T>
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
+{
+    ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
+    // Simple tensor for the 4x4 input tile
+    SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the 4x2 (width x height) temporary tile
+    SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
+
+    // Simple tensor for the 2x2 output tile
+    SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
+
+    // Initialize transformation matrix
+    // 1   | 1   | 1   | 0
+    // 0   | 1   | -1  | -1
+    trans_matrix[0 + 0 * 4] = 1.0f;
+    trans_matrix[1 + 0 * 4] = 1.0f;
+    trans_matrix[2 + 0 * 4] = 1.0f;
+    trans_matrix[3 + 0 * 4] = 0.0f;
+    trans_matrix[0 + 1 * 4] = 0.0f;
+    trans_matrix[1 + 1 * 4] = 1.0f;
+    trans_matrix[2 + 1 * 4] = -1.0f;
+    trans_matrix[3 + 1 * 4] = -1.0f;
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int w_in        = in.shape()[0];
+    const int h_in        = in.shape()[1];
+    const int c_in        = in.shape()[2];
+    const int w_out       = out.shape()[0];
+    const int h_out       = out.shape()[1];
+    const int c_out       = out.shape()[2];
+    const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+    // Input strides
+    const int stridey_in = w_in;
+    const int stridez_in = stridey_in * h_in;
+    const int stridew_in = stridez_in * c_in;
+
+    // Output strides
+    const int stridey_out = w_out;
+    const int stridez_out = stridey_out * h_out;
+    const int stridew_out = stridez_out * c_out;
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int y = 0; y < h_in; ++y)
+        {
+            for(int x = 0; x < w_in; ++x)
+            {
+                // Load the 4x4 tile across the 16 channels of the input tensor
+                for(int z = 0; z < c_in; ++z)
+                {
+                    input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+                }
+
+                // First transformation
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation
+                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+                // Store the 2x2 output tile
+                const int xo = (y % num_tiles_x) * 2;
+                const int yo = (y / num_tiles_x) * 2;
+                const int zo = x;
+
+                const int output_offset                  = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+                out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+                // Check out-of-bound writes
+                if(xo + 1 < w_out)
+                {
+                    out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+                }
+
+                if(yo + 1 < h_out)
+                {
+                    out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+                }
+
+                if((yo + 1 < h_out) && (xo + 1 < w_out))
+                {
+                    out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+                }
+            }
+        }
+    }
+}
 } // namespace
 
 template <typename T>
@@ -234,8 +334,32 @@
     return out;
 }
 
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+    ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    switch(kernel_dims.width)
+    {
+        case 3:
+            winograd_output_transform3x3(in, out, num_tiles.width);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+            break;
+    }
+
+    return out;
+}
+
 template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
 template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
 } // namespace reference
 } // namespace validation
 } // namespace test