COMPMID-1623: NEWinograd: reduce the number of specialised output tile functions.

Change-Id: I4d9240924fe483d2dd127ad6a4ae6f8066f61bd1
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/151893
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy <andrew.mundy@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
index cfd2029..ea842a4 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
@@ -23,38 +23,33 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<1, 2, 1, 7>::OutputTransform<float>;
-using TransformTransposed = WinogradGEMM<2, 1, 7, 1>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  (void) shape;
-  return 0;  // TODO
-}
-
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_2_7_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
   (void) output_row_stride;
-  constexpr int cells_j = output_tile_cols - pad_right;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 2;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
+
 
   // Construct a map to the output cells
   float *outptrs[cells_j];
@@ -149,22 +144,20 @@
     }
   }
 }
+}  // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>;
 
 template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>;
+
 template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
-{
-  {
-    Transform::template process_tile<0, 0>,
-    Transform::template process_tile<0, 1>,
-  },
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2_7_fp32_process_tile<true, 1>
 };
 
-
-template <>
-template <>
-const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {};
-
-template struct WinogradGEMM<1, 2, 1, 7>::OutputTransform<float>;
-template struct WinogradGEMM<2, 1, 7, 1>::OutputTransform<float>;
+template class OutputTransform<1, 7, 1, 8, float>;
+template class OutputTransform<7, 1, 8, 1, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index 3b3cda0..597b074 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -23,59 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(shape.n_rows, 2);
-  const int tile_N = iceildiv(shape.n_cols, 2);
-  return 24 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- *      ___     ___
- *     |   |   |  X|
- *     |___|   |__X|
- *
- *      ___     ___
- *     |   |   |  X|
- *     |X_X|   |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_3x3_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 2 - pad_bottom;
-  constexpr int cells_j = 2 - pad_right;
+  constexpr int OutputTileRows = 2, OutputTileCols = 2;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  const int cells_i = OutputTileRows - pad_bottom;
+  const int cells_j = OutputTileCols - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[OutputTileRows][OutputTileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -373,19 +348,28 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,  // No padding
-    Transform::template process_tile<0, 1>,  // Right padding
-  },
-  {
-    Transform::template process_tile<1, 0>,  // Bottom padding
-    Transform::template process_tile<1, 1>,  // Bottom and right padding
-  }
+using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0>
 };
 
-template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<3, 3, 4, 4, float>;
 }  // namespace winograd
+
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index cafce95..60d7181 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -23,57 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  (void) shape;
-  return 0;  // TODO
-}
-
-/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- *      ___     ___
- *     |   |   |  X|
- *     |___|   |__X|
- *
- *      ___     ___
- *     |   |   |  X|
- *     |X_X|   |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_5x5_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 2 - pad_bottom;
-  constexpr int cells_j = 2 - pad_right;
+  constexpr int OutputTileRows = 2, OutputTileCols = 2;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  const int cells_i = 2 - pad_bottom;
+  const int cells_j = 2 - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[OutputTileRows][OutputTileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -365,19 +342,28 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,  // No padding
-    Transform::template process_tile<0, 1>,  // Right padding
-  },
-  {
-    Transform::template process_tile<1, 0>,  // Bottom padding
-    Transform::template process_tile<1, 1>,  // Bottom and right padding
-  }
+using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0>
 };
 
-template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<5, 5, 6, 6, float>;
 }  // namespace winograd
+
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
index 2417f52..911759b 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
@@ -23,38 +23,32 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<1, 4, 1, 5>::OutputTransform<float>;
-using TransformTransposed = WinogradGEMM<4, 1, 5, 1>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  (void) shape;
-  return 0;  // TODO
-}
-
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_4_5_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
   (void) output_row_stride;
-  constexpr int cells_j = output_tile_cols - pad_right;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 4;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
 
   // Construct a map to the output cells
   float *outptrs[cells_j];
@@ -156,23 +150,22 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,
-    Transform::template process_tile<0, 1>,
-    Transform::template process_tile<0, 2>,
-    Transform::template process_tile<0, 3>,
-  },
+using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_4_5_fp32_process_tile<true, 1>,
+  winograd_output_transform_4_5_fp32_process_tile<true, 2>,
+  winograd_output_transform_4_5_fp32_process_tile<true, 3>
 };
 
-template <>
-template <>
-const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {};
-
-
-template struct WinogradGEMM<1, 4, 1, 5>::OutputTransform<float>;
-template struct WinogradGEMM<4, 1, 5, 1>::OutputTransform<float>;
+template class OutputTransform<1, 5, 1, 8, float>;
+template class OutputTransform<5, 1, 8, 1, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index cd3bdef..15cc04b 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -23,73 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(shape.n_rows, 4);
-  const int tile_N = iceildiv(shape.n_cols, 4);
-  return 170 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain up to 3
- * padded values to the right and bottom columns or rows of the tile, e.g.:
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |_______|   |______X|  |____X_X|  |__X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*
-* We provide a specialised output transform for each of these instances.
-*/
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_4x4_3x3_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 4 - pad_bottom;
-  constexpr int cells_j = 4 - pad_right;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  constexpr int TileRows = 4, TileCols = 4;
+
+  const int cells_i = TileRows - pad_bottom;
+  const int cells_j = TileCols - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[TileRows][TileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -437,35 +398,31 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,
-    Transform::template process_tile<0, 1>,
-    Transform::template process_tile<0, 2>,
-    Transform::template process_tile<0, 3>,
-  },
-  {
-    Transform::template process_tile<1, 0>,
-    Transform::template process_tile<1, 1>,
-    Transform::template process_tile<1, 2>,
-    Transform::template process_tile<1, 3>,
-  },
-  {
-    Transform::template process_tile<2, 0>,
-    Transform::template process_tile<2, 1>,
-    Transform::template process_tile<2, 2>,
-    Transform::template process_tile<2, 3>,
-  },
-  {
-    Transform::template process_tile<3, 0>,
-    Transform::template process_tile<3, 1>,
-    Transform::template process_tile<3, 2>,
-    Transform::template process_tile<3, 3>,
-  }
+using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
 };
 
-template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
+};
+
+template class OutputTransform<3, 3, 6, 6, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
index 16667cc..58bed71 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
@@ -23,38 +23,32 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<1, 6, 1, 3>::OutputTransform<float>;
-using TransformTransposed = WinogradGEMM<6, 1, 3, 1>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  (void) shape;
-  return 0;  // TODO
-}
-
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_6_3_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
   (void) output_row_stride;
-  constexpr int cells_j = output_tile_cols - pad_right;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 6;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
 
   // Construct a map to the output cells
   float *outptrs[cells_j];
@@ -162,25 +156,24 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,
-    Transform::template process_tile<0, 1>,
-    Transform::template process_tile<0, 2>,
-    Transform::template process_tile<0, 3>,
-    Transform::template process_tile<0, 4>,
-    Transform::template process_tile<0, 5>,
-  },
+using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_6_3_fp32_process_tile<true, 1>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 2>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 3>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 4>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 5>,
 };
 
-template <>
-template <>
-const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {};
-
-
-template struct WinogradGEMM<1, 6, 1, 3>::OutputTransform<float>;
-template struct WinogradGEMM<6, 1, 3, 1>::OutputTransform<float>;
+template class OutputTransform<1, 3, 1, 8, float>;
+template class OutputTransform<3, 1, 8, 1, float>;
 }  // namespace winograd