COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4)
Implemented Winograd Output Transform (2x2,3x3) on OpenCL
Implemented CLWinogradConvolutionLayer on OpenCL
Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb..c760663 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -39,79 +39,6 @@
namespace
{
template <typename T>
-void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
-{
- TensorShape shape4x4(4u, 4u);
-
- // Simple tensor for the 4x4 input tile
- SimpleTensor<T> src_tile{ shape4x4, src.data_type() };
-
- // Simple tensor for the 4x4 temporary tile
- SimpleTensor<T> tmp_tile{ shape4x4, src.data_type() };
-
- // Simple tensor for the 4x4 output tile
- SimpleTensor<T> dst_tile{ shape4x4, src.data_type() };
-
- // Simple tensor for the transformation matrix
- SimpleTensor<T> matrix{ shape4x4, src.data_type() };
-
- // Simple tensor for the transformation matrix transposed
- SimpleTensor<T> matrix_transposed{ shape4x4, src.data_type() };
-
- const float matrix_values[] = { 1.f, 0.f, -1.f, 0.f,
- 0.f, 1.f, 1.f, 0.f,
- 0.f, -1.f, 1.f, 0.f,
- 0.f, 1.f, 0.f, -1.f
- };
-
- for(int i = 0; i < matrix.num_elements(); ++i)
- {
- matrix[i] = matrix_values[i];
- }
-
- transpose_matrix(matrix, matrix_transposed);
-
- const int in_w = src.shape().x();
- const int in_h = src.shape().y();
- const int in_d = src.shape().z();
- const int num_batches = src.shape().total_size() / (in_w * in_h * in_d);
- const int num_tiles_x = std::ceil((in_w - 2 + conv_info.pad_left() + conv_info.pad_right()) / 2.0f);
- const int num_tiles_y = std::ceil((in_h - 2 + conv_info.pad_top() + conv_info.pad_bottom()) / 2.0f);
-
- ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(dst.shape().y()));
-
- for(int b = 0; b < num_batches; ++b)
- {
- for(int z = 0; z < in_d; ++z)
- {
- for(int y = 0; y < num_tiles_y; ++y)
- {
- for(int x = 0; x < num_tiles_x; ++x)
- {
- int xi = x * 2 - conv_info.pad_left();
- int yi = y * 2 - conv_info.pad_top();
-
- // Get the 4x4 tile from the input tensor
- get_tile(src, src_tile, Coordinates(xi, yi, z, b));
-
- // Compute the transformation
- matrix_multiply(matrix, src_tile, tmp_tile);
- matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
-
- // Store the 4x4 output tile across the 16 channels
- for(int i = 0; i < 16; ++i)
- {
- int xo = z;
- int yo = x + y * num_tiles_x;
- dst[coords2index(dst.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
- }
- }
- }
- }
- }
-}
-
-template <typename T>
void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
{
// Simple tensor for the 3x3 input tile
@@ -191,6 +118,179 @@
}
}
}
+// Reference 3x3 Winograd input transform: walks src in 4x4 tiles, runs each tile through two matrix_multiply steps and scatters the 16 results along dst's third dimension
+template <typename T>
+void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
+{
+ TensorShape shape4x4(4u, 4u);
+
+ // Scratch tensor for the 4x4 input tile read from src
+ SimpleTensor<T> src_tile{ shape4x4, src.data_type() };
+
+ // Scratch tensor for the 4x4 intermediate result of the first matrix_multiply
+ SimpleTensor<T> tmp_tile{ shape4x4, src.data_type() };
+
+ // Scratch tensor for the 4x4 transformed tile that is written to dst
+ SimpleTensor<T> dst_tile{ shape4x4, src.data_type() };
+
+ // Transformation matrix, filled element by element from matrix_values below
+ SimpleTensor<T> matrix{ shape4x4, src.data_type() };
+
+ // Transposed copy of the transformation matrix, used by the second matrix_multiply
+ SimpleTensor<T> matrix_transposed{ shape4x4, src.data_type() };
+
+ const float matrix_values[] = { 1.f, 0.f, -1.f, 0.f,
+ 0.f, 1.f, 1.f, 0.f,
+ 0.f, -1.f, 1.f, 0.f,
+ 0.f, 1.f, 0.f, -1.f
+ };
+
+ for(int i = 0; i < matrix.num_elements(); ++i)
+ {
+ matrix[i] = matrix_values[i];
+ }
+
+ transpose_matrix(matrix, matrix_transposed);
+
+ const int in_w = src.shape().x();
+ const int in_h = src.shape().y();
+ const int in_d = src.shape().z();
+ const int num_batches = src.shape().total_size() / (in_w * in_h * in_d);
+ const int num_tiles_x = std::ceil((in_w - 2 + conv_info.pad_left() + conv_info.pad_right()) / 2.0f);
+ const int num_tiles_y = std::ceil((in_h - 2 + conv_info.pad_top() + conv_info.pad_bottom()) / 2.0f);
+
+ ARM_COMPUTE_ERROR_ON((num_tiles_x * num_tiles_y) != static_cast<int>(dst.shape().y()));
+
+ for(int b = 0; b < num_batches; ++b)
+ {
+ for(int z = 0; z < in_d; ++z)
+ {
+ for(int y = 0; y < num_tiles_y; ++y)
+ {
+ for(int x = 0; x < num_tiles_x; ++x)
+ {
+ int xi = x * 2 - conv_info.pad_left();
+ int yi = y * 2 - conv_info.pad_top();
+
+ // Get the 4x4 tile starting at (xi, yi); tiles start 2 elements apart and xi/yi may be negative when padding is non-zero (out-of-range handling is left to get_tile)
+ get_tile(src, src_tile, Coordinates(xi, yi, z, b));
+
+ // Compute the transformation in two matrix_multiply steps, going through tmp_tile
+ matrix_multiply(matrix, src_tile, tmp_tile);
+ matrix_multiply(tmp_tile, matrix_transposed, dst_tile);
+
+ // Store the 16 tile values along dst's third dimension: x = input channel z, y = linear tile index
+ for(int i = 0; i < 16; ++i)
+ {
+ int xo = z;
+ int yo = x + y * num_tiles_x;
+ dst[coords2index(dst.shape(), Coordinates(xo, yo, i, b))] = dst_tile[i];
+ }
+ }
+ }
+ }
+ }
+}
+// Reference 3x3 Winograd output transform: turns the 16 values stored along the third dimension of 'in' into a 2x2 spatial tile of 'out'; num_tiles_x is the number of tiles per output row
+template <typename T>
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
+{
+ ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+ ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
+ // Simple tensor for the 4x4 input tile
+ SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix (4 columns x 2 rows, indexed below as col + row * 4)
+ SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix transpose (2 columns x 4 rows)
+ SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the 4x2 temporary tile (output of the first matrix_multiply)
+ SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
+
+ // Simple tensor for the 2x2 output tile
+ SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
+
+ // Initialize transformation matrix
+ // 1 | 1 | 1 | 0
+ // 0 | 1 | -1 | -1
+ trans_matrix[0 + 0 * 4] = 1.0f;
+ trans_matrix[1 + 0 * 4] = 1.0f;
+ trans_matrix[2 + 0 * 4] = 1.0f;
+ trans_matrix[3 + 0 * 4] = 0.0f;
+ trans_matrix[0 + 1 * 4] = 0.0f;
+ trans_matrix[1 + 1 * 4] = 1.0f;
+ trans_matrix[2 + 1 * 4] = -1.0f;
+ trans_matrix[3 + 1 * 4] = -1.0f;
+
+ // Transpose the transformation matrix
+ transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+ const int w_in = in.shape()[0];
+ const int h_in = in.shape()[1];
+ const int c_in = in.shape()[2];
+ const int w_out = out.shape()[0];
+ const int h_out = out.shape()[1];
+ const int c_out = out.shape()[2];
+ const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+ // Input strides, in elements, for the y, z and batch dimensions
+ const int stridey_in = w_in;
+ const int stridez_in = stridey_in * h_in;
+ const int stridew_in = stridez_in * c_in;
+
+ // Output strides, in elements, for the y, z and batch dimensions
+ const int stridey_out = w_out;
+ const int stridez_out = stridey_out * h_out;
+ const int stridew_out = stridez_out * c_out;
+
+ for(int n = 0; n < num_batches; ++n)
+ {
+ for(int y = 0; y < h_in; ++y)
+ {
+ for(int x = 0; x < w_in; ++x)
+ {
+ // Load the 4x4 tile across the c_in channels of the input tensor (c_in is checked to be 16 above)
+ for(int z = 0; z < c_in; ++z)
+ {
+ input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+ }
+
+ // First transformation
+ matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+ // Second transformation
+ matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+ // Store the 2x2 output tile: input y is the linear tile index, input x becomes the output channel
+ const int xo = (y % num_tiles_x) * 2;
+ const int yo = (y / num_tiles_x) * 2;
+ const int zo = x;
+
+ const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+ out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+ // Skip the elements of the 2x2 tile that would fall outside the output width/height
+ if(xo + 1 < w_out)
+ {
+ out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+ }
+
+ if(yo + 1 < h_out)
+ {
+ out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+ }
+
+ if((yo + 1 < h_out) && (xo + 1 < w_out))
+ {
+ out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+ }
+ }
+ }
+ }
+}
} // namespace
template <typename T>
@@ -234,8 +334,32 @@
return out;
}
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+ ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+ ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+ // Preconditions checked above: NCHW layout, square kernel, and in.shape()[1] equal to the total number of tiles
+ // Create the reference output tensor with the caller-provided shape and the input's data type; it is filled by the kernel-size specific transform and returned
+ SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+ switch(kernel_dims.width)
+ {
+ case 3:
+ winograd_output_transform3x3(in, out, num_tiles.width);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+ break;
+ }
+
+ return out;
+}
+
template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
} // namespace reference
} // namespace validation
} // namespace test