src/core/NEON/kernels/convolution/winograd/input_transform.hpp - ml/ComputeLibrary - Gitiles

 /*
  * Copyright (c) 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #pragma once

 #include "arm_compute/core/Error.h"

 #include "src/core/NEON/kernels/assembly/winograd.hpp"

 #include "src/core/NEON/kernels/arm_conv/addressing.hpp"
 #include <algorithm>
 #include <cstring>
 #include <functional>

 namespace arm_conv {
 namespace winograd {
 namespace input_transform {

 namespace {

 /* Integer division which rounds towards positive infinity.
  *
  * For non-negative `a` and positive `b` this yields the smallest integer `q`
  * such that `q * b >= a`. The dividend is biased by `b - 1` before the
  * truncating division; `b` must not be zero.
  */
 template <typename T>
 constexpr T iceildiv(const T a, const T b)
 {
   // Keep the promoted type of the sum so narrow integer types are not
   // truncated before the division takes place.
   const auto biased_dividend = a + b - 1;
   return static_cast<T>(biased_dividend / b);
 }

 }

 /* Driver class for the Winograd input transforms.
  *
  * This provides a base implementation which handles iteration over the input
  * tensor; subclasses are responsible for managing working space and executing
  * the transform on individual tiles.
  *
  * Rows of tiles are striped across threads: within every batch, thread
  * `thread_id` handles tile rows `thread_id`, `thread_id + n_threads`, and so
  * on.
  */
 template <typename TIn, typename TOut=TIn>
 class TransformBase : public ITransform
 {
   const std::string m_name;
   const unsigned int m_input_rows, m_input_cols;  // Dimensions of one input tile

   protected:
   /* Bytes of working space needed by each thread (none by default). */
   virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
   {
     return 0;
   }

   /* Hook invoked at the start of `execute_internal` on the calling thread's
    * slice of the working space, before any tile is processed (no-op by
    * default).
    */
   virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
   {
     // Nothing to do
   }

   /* Transform a single tile; implemented by subclasses.
    *
    * `inptr` addresses the first element of the tile which lies inside the
    * input tensor. `pad_top`/`pad_left` give the number of tile rows/columns
    * which fall before the start of the tensor, and `valid_rows`/`valid_cols`
    * give the number of tensor rows/columns available from `inptr` onwards
    * (this may exceed the size of the tile). `working_space` is this thread's
    * slice of the working space.
    */
   virtual void execute_tile(
     unsigned int n_channels,
     const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
     TOut *outptr, size_t ld_out_matrix,
     unsigned int pad_top, unsigned int valid_rows,
     unsigned int pad_left, unsigned int valid_cols,
     void *working_space
   ) const = 0;

   /* Typed implementation of `execute`: visits every batch and every tile
    * assigned to this thread and calls `execute_tile` for each of them.
    *
    * The `ld_*` arguments are strides measured in elements of `TIn`/`TOut`,
    * since they are applied directly to the typed pointers.
    */
   void execute_internal(
     const ConvolutionArgs &args,
     const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
     TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
     void *working_space, unsigned int thread_id, unsigned int n_threads
   ) const
   {
     // Get the working space for this thread, and initialise it.
     working_space = reinterpret_cast<char *>(working_space) +
                     this->get_working_space_per_thread(args) * thread_id;
     this->initialise_thread_working_space(args, working_space);

     // Get tile traversal parameters. The stride between tiles is clamped to
     // be at least one, and the same clamped value is used as the divisor when
     // counting tiles so that a zero stride can never reach the division.
     const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1);
     const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1);
     const auto n_tile_rows = iceildiv(args.output_shape.rows, tile_stride_rows);
     const auto n_tile_cols = iceildiv(args.output_shape.cols, tile_stride_cols);

     // Execute over all batches
     for (unsigned int batch = 0; batch < args.n_batches; batch++)
     {
       // Start at the output of the first row of tiles owned by this thread
       auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row;

       // For a single batch, stripe the rows over the threads.
       for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads)
       {
         // Compute pointers and padding for this row of tiles. When the tile
         // starts inside the top padding the pointer stays on the first input
         // row; otherwise it is moved down to the tile's first row.
         const auto start_i = tile_i * tile_stride_rows;
         const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0;
         const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row;
         const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top);

         // Iterate over columns
         for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++)
         {
           // Compute pointers and padding for this tile, then delegate to
           // execute the kernel.
           const auto start_j = tile_j * tile_stride_cols;
           const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0;
           const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col;
           const auto valid_cols = args.input_shape.cols - (pad_left ? 0 : start_j - args.pad_left);

           this->execute_tile(
             args.n_input_channels,
             inptr_tile, ld_in_row, ld_in_col,
             outptr_tile, ld_out_matrix,
             pad_top, valid_rows, pad_left, valid_cols,
             working_space
           );
           outptr_tile += ld_out_row;
         }

         // Skip over the rows of tiles which belong to the other threads
         outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row;
       }

       inptr += ld_in_batch;
       outptr += ld_out_batch;
     }
   }

   public:
   /* `input_rows` and `input_cols` give the size of the tile consumed by the
    * transform.
    */
   TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols)
   : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols)
   {
   }

   const std::string &get_name(void) const override { return m_name; }

   unsigned int get_input_rows(void) const override final { return m_input_rows; }
   unsigned int get_input_cols(void) const override final { return m_input_cols; }

   /* Total working space in bytes: one per-thread slice for each thread. */
   size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
   {
     return n_threads * this->get_working_space_per_thread(args);
   }

   /* Type-erased entry point: casts the tensor pointers to `TIn`/`TOut` and
    * forwards everything to `execute_internal`.
    */
   void execute(
     const ConvolutionArgs &args,
     const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
     void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
     void *working_space, unsigned int thread_id, unsigned int n_threads
   ) const override
   {
     execute_internal(
       args,
       reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col,
       reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row,
       working_space, thread_id, n_threads
     );
   }
 };

 /* Input transform whose kernel reads directly from the input tensor and is
  * told how much padding surrounds the valid data on each side of the tile.
  */
 template <typename TIn, typename TOut=TIn>
 class TransformDirect : public TransformBase<TIn, TOut>
 {
   using Kernel = std::function<void(
     unsigned int,  // Number of channels
     const TIn *,  size_t, size_t,  // Pointer to first valid input element, row and column stride
     unsigned int, unsigned int, unsigned int, unsigned int,  // Top, left, bottom and right padding
     TOut *, size_t  // Base output pointer, stride between matrices
   )>;
   const Kernel m_kernel;

   protected:
   /* Derive the bottom and right padding from the number of valid rows and
    * columns, then hand the tile to the kernel. No working space is used.
    */
   void execute_tile(
     unsigned int n_channels,
     const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
     TOut *outptr, size_t ld_out_matrix,
     unsigned int pad_top, unsigned int valid_rows,
     unsigned int pad_left, unsigned int valid_cols,
     void *working_space
   ) const override
   {
     ARM_COMPUTE_UNUSED(working_space);

     // Trailing padding is whatever part of the tile (after the leading
     // padding has been removed) is not covered by valid elements.
     const auto trailing_padding = [] (const unsigned int remaining, const unsigned int valid) -> unsigned int
     {
       return remaining >= valid ? remaining - valid : 0u;
     };

     const unsigned int rows_after_top = this->get_input_rows() - pad_top;
     const unsigned int cols_after_left = this->get_input_cols() - pad_left;
     const unsigned int pad_bottom = trailing_padding(rows_after_top, valid_rows);
     const unsigned int pad_right = trailing_padding(cols_after_left, valid_cols);

     // Run the transform on this tile
     m_kernel(
       n_channels, inptr, ld_in_row, ld_in_col,
       pad_top, pad_left, pad_bottom, pad_right,
       outptr, ld_out_matrix
     );
   }

   public:
   /* `kernel` is stored and invoked once for every tile. */
   TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
   : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
   {
   }
 };

 /* Input transform whose kernel consumes an array of pointers, one for each
  * point of the input tile, rather than a base pointer and strides.
  *
  * Each thread's slice of the working space is laid out as a `Workspace`
  * header, followed by the pointer array, followed by a zero-filled buffer of
  * `n_input_channels` elements.
  */
 template <typename TIn, typename TOut=TIn>
 class TransformIndirect : public TransformBase<TIn, TOut>
 {
   using Kernel = std::function<void(
     unsigned int,  // Number of channels
     const TIn *const *,  // Input pointers (one per point)
     TOut *, size_t   // Base output pointer, stride between matrices
   )>;
   const Kernel m_kernel;

   struct Workspace
   {
     const TIn **inptrs;       // Pointer array handed to the kernel
     const TIn *input_buffer;  // Zero-filled buffer handed to `fill_pointer_array`
   };

   /* Size in bytes of the pointer array: one `const TIn *` per tile point. */
   size_t sizeof_inptr_array(void) const
   {
     return sizeof(const TIn *) * this->get_input_rows() * this->get_input_cols();
   }

   protected:
   size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
   {
     const size_t bytes_used =
       sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels;

     // Slices for consecutive threads are packed back to back, so round the
     // size up to the alignment of the header; otherwise the `Workspace` and
     // pointer array of every thread but the first could be misaligned.
     // NOTE(review): this assumes the base working space pointer is itself
     // suitably aligned - confirm against the allocator used by callers.
     constexpr size_t alignment = alignof(Workspace);
     return (bytes_used + alignment - 1) / alignment * alignment;
   }

   void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override
   {
     // The header sits at the very start of the slice...
     Workspace *ws = reinterpret_cast<Workspace *>(buffer);
     buffer = ws + 1;

     // ...immediately followed by the pointer array...
     ws->inptrs = reinterpret_cast<const TIn **>(buffer);
     buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array();

     // ...and finally the buffer of zeros, one element per input channel.
     ws->input_buffer = reinterpret_cast<const TIn *>(buffer);
     std::memset(buffer, 0, sizeof(TIn) * args.n_input_channels);
   }

   void execute_tile(
     unsigned int n_channels,
     const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
     TOut *outptr, size_t ld_out_matrix,
     unsigned int pad_top, unsigned int valid_rows,
     unsigned int pad_left, unsigned int valid_cols,
     void *working_space
   ) const override
   {
     // Get the working space
     auto ws = reinterpret_cast<Workspace *>(working_space);

     // Construct the input pointer array based on the given arguments
     // (presumably padded points are directed at the zero-filled buffer -
     // confirm against `fill_pointer_array`).
     fill_pointer_array<const TIn>(
       ws->inptrs, this->get_input_rows(), this->get_input_cols(),
       inptr, ld_in_row, ld_in_col,
       ws->input_buffer,
       pad_top, valid_rows,
       pad_left, valid_cols
     );

     // Execute the kernel
     m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix);
   }

   public:
   /* `kernel` is stored and invoked once for every tile. */
   TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
   : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
   {
   }
 };

 /* Input transform whose kernel has no notion of padding.
  *
  * Tiles which lie entirely inside the input tensor are passed straight to the
  * kernel. Any other tile is first copied into a zero-filled patch held in the
  * working space, and the kernel is run on that patch instead.
  */
 template <typename TIn, typename TOut=TIn>
 class TransformUnpadded : public TransformBase<TIn, TOut>
 {
   using Kernel = std::function<void(
     unsigned int,  // Number of channels
     const TIn *,  size_t, size_t,  // Pointer to first input element, row and column stride
     TOut *, size_t // Base output pointer, stride between matrices
   )>;
   const Kernel m_kernel;

   protected:
   /* Each thread needs room for one complete tile: every point of the tile
    * for every input channel.
    */
   size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
   {
     const auto input_points = this->get_input_rows() * this->get_input_cols();
     return sizeof(TIn) * input_points * args.n_input_channels;
   }

   void execute_tile(
     unsigned int n_channels,
     const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
     TOut *const outptr, const size_t ld_out_matrix,
     const unsigned int pad_top, const unsigned int valid_rows,
     const unsigned int pad_left, const unsigned int valid_cols,
     void *const working_space
   ) const override
   {
     // If there's any padding, then copy the valid portion of the tensor into
     // the working space and reset the pointer, row and column strides to point
     // at this copy of the data.
     if (pad_top || valid_rows < this->get_input_rows() ||
         pad_left || valid_cols < this->get_input_cols())
     {
       // The patch is densely packed: channels, then columns, then rows
       const auto patch_ld_col = n_channels;
       const auto patch_ld_row = patch_ld_col * this->get_input_cols();
       auto patch = reinterpret_cast<TIn *>(working_space) +
                    pad_top*patch_ld_row + pad_left*patch_ld_col;

       // Fill the input patch with padding
       std::memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row);

       // Determine the bounds for which to copy
       const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows());
       const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols());

       // Copy across the valid portion of the patch
       for (auto i = pad_top; i < last_i; i++)
       {
         auto inptr_col = inptr;
         inptr += ld_in_row;

         auto patch_col = patch;
         patch += patch_ld_row;

         for (auto j = pad_left; j < last_j; j++)
         {
           // Perform the copy and progress both input and patch pointers
           std::memcpy(patch_col, inptr_col, n_channels * sizeof(TIn));
           inptr_col += ld_in_col;
           patch_col += patch_ld_col;
         }
       }

       // Override the input pointer and strides
       inptr = reinterpret_cast<const TIn *>(working_space);
       ld_in_col = patch_ld_col;
       ld_in_row = patch_ld_row;
     }

     // Call the kernel
     m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix);
   }

   public:
   /* `kernel` is stored and invoked once for every tile. */
   TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
   : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
   {
   }

   /* Utility method which can be used to get a transposed version of a kernel,
    * this just calls the kernel with the input row and column strides reversed.
    *
    * This is deliberately not `constexpr`: the returned `std::function` is not
    * a literal type, so the call could never be evaluated at compile time.
    */
   static Kernel get_transposed_kernel(const Kernel &kernel)
   {
     return [kernel] (
       const unsigned int n_channels,
       const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
       TOut *const outptr, const size_t ld_out_matrix
     ) {
       kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix);
     };
   }
 };

 }  // namespace input_transform
 }  // namespace winograd
 }  // namespace arm_conv
	/*
	* Copyright (c) 2022 Arm Limited.
	*
	* SPDX-License-Identifier: MIT
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to
	* deal in the Software without restriction, including without limitation the
	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
	* sell copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in all
	* copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	* SOFTWARE.
	*/

	#pragma once

	#include "arm_compute/core/Error.h"

	#include "src/core/NEON/kernels/assembly/winograd.hpp"

	#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
	#include <algorithm>
	#include <cstring>
	#include <functional>

	namespace arm_conv {
	namespace winograd {
	namespace input_transform {

	namespace {

	/* Integer division which rounds towards positive infinity.
	 *
	 * For non-negative `a` and positive `b` this yields the smallest integer `q`
	 * such that `q * b >= a`. The dividend is biased by `b - 1` before the
	 * truncating division; `b` must not be zero.
	 */
	template <typename T>
	constexpr T iceildiv(const T a, const T b)
	{
	  // Keep the promoted type of the sum so narrow integer types are not
	  // truncated before the division takes place.
	  const auto biased_dividend = a + b - 1;
	  return static_cast<T>(biased_dividend / b);
	}

	}

	/* Driver class for the Winograd input transforms.
	 *
	 * This provides a base implementation which handles iteration over the input
	 * tensor; subclasses are responsible for managing working space and executing
	 * the transform on individual tiles.
	 *
	 * Rows of tiles are striped across threads: within every batch, thread
	 * `thread_id` handles tile rows `thread_id`, `thread_id + n_threads`, and so
	 * on.
	 */
	template <typename TIn, typename TOut=TIn>
	class TransformBase : public ITransform
	{
	  const std::string m_name;
	  const unsigned int m_input_rows, m_input_cols;  // Dimensions of one input tile

	  protected:
	  /* Bytes of working space needed by each thread (none by default). */
	  virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
	  {
	    return 0;
	  }

	  /* Hook invoked at the start of `execute_internal` on the calling thread's
	   * slice of the working space, before any tile is processed (no-op by
	   * default).
	   */
	  virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
	  {
	    // Nothing to do
	  }

	  /* Transform a single tile; implemented by subclasses.
	   *
	   * `inptr` addresses the first element of the tile which lies inside the
	   * input tensor. `pad_top`/`pad_left` give the number of tile rows/columns
	   * which fall before the start of the tensor, and `valid_rows`/`valid_cols`
	   * give the number of tensor rows/columns available from `inptr` onwards
	   * (this may exceed the size of the tile). `working_space` is this thread's
	   * slice of the working space.
	   */
	  virtual void execute_tile(
	    unsigned int n_channels,
	    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
	    TOut *outptr, size_t ld_out_matrix,
	    unsigned int pad_top, unsigned int valid_rows,
	    unsigned int pad_left, unsigned int valid_cols,
	    void *working_space
	  ) const = 0;

	  /* Typed implementation of `execute`: visits every batch and every tile
	   * assigned to this thread and calls `execute_tile` for each of them.
	   *
	   * The `ld_*` arguments are strides measured in elements of `TIn`/`TOut`,
	   * since they are applied directly to the typed pointers.
	   */
	  void execute_internal(
	    const ConvolutionArgs &args,
	    const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
	    TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
	    void *working_space, unsigned int thread_id, unsigned int n_threads
	  ) const
	  {
	    // Get the working space for this thread, and initialise it.
	    working_space = reinterpret_cast<char *>(working_space) +
	                    this->get_working_space_per_thread(args) * thread_id;
	    this->initialise_thread_working_space(args, working_space);

	    // Get tile traversal parameters. The stride between tiles is clamped to
	    // be at least one, and the same clamped value is used as the divisor when
	    // counting tiles so that a zero stride can never reach the division.
	    const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1);
	    const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1);
	    const auto n_tile_rows = iceildiv(args.output_shape.rows, tile_stride_rows);
	    const auto n_tile_cols = iceildiv(args.output_shape.cols, tile_stride_cols);

	    // Execute over all batches
	    for (unsigned int batch = 0; batch < args.n_batches; batch++)
	    {
	      // Start at the output of the first row of tiles owned by this thread
	      auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row;

	      // For a single batch, stripe the rows over the threads.
	      for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads)
	      {
	        // Compute pointers and padding for this row of tiles. When the tile
	        // starts inside the top padding the pointer stays on the first input
	        // row; otherwise it is moved down to the tile's first row.
	        const auto start_i = tile_i * tile_stride_rows;
	        const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0;
	        const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row;
	        const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top);

	        // Iterate over columns
	        for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++)
	        {
	          // Compute pointers and padding for this tile, then delegate to
	          // execute the kernel.
	          const auto start_j = tile_j * tile_stride_cols;
	          const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0;
	          const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col;
	          const auto valid_cols = args.input_shape.cols - (pad_left ? 0 : start_j - args.pad_left);

	          this->execute_tile(
	            args.n_input_channels,
	            inptr_tile, ld_in_row, ld_in_col,
	            outptr_tile, ld_out_matrix,
	            pad_top, valid_rows, pad_left, valid_cols,
	            working_space
	          );
	          outptr_tile += ld_out_row;
	        }

	        // Skip over the rows of tiles which belong to the other threads
	        outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row;
	      }

	      inptr += ld_in_batch;
	      outptr += ld_out_batch;
	    }
	  }

	  public:
	  /* `input_rows` and `input_cols` give the size of the tile consumed by the
	   * transform.
	   */
	  TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols)
	  : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols)
	  {
	  }

	  const std::string &get_name(void) const override { return m_name; }

	  unsigned int get_input_rows(void) const override final { return m_input_rows; }
	  unsigned int get_input_cols(void) const override final { return m_input_cols; }

	  /* Total working space in bytes: one per-thread slice for each thread. */
	  size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
	  {
	    return n_threads * this->get_working_space_per_thread(args);
	  }

	  /* Type-erased entry point: casts the tensor pointers to `TIn`/`TOut` and
	   * forwards everything to `execute_internal`.
	   */
	  void execute(
	    const ConvolutionArgs &args,
	    const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
	    void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
	    void *working_space, unsigned int thread_id, unsigned int n_threads
	  ) const override
	  {
	    execute_internal(
	      args,
	      reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col,
	      reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row,
	      working_space, thread_id, n_threads
	    );
	  }
	};

	/* Input transform whose kernel reads directly from the input tensor and is
	 * told how much padding surrounds the valid data on each side of the tile.
	 */
	template <typename TIn, typename TOut=TIn>
	class TransformDirect : public TransformBase<TIn, TOut>
	{
	  using Kernel = std::function<void(
	    unsigned int, // Number of channels
	    const TIn *, size_t, size_t, // Pointer to first valid input element, row and column stride
	    unsigned int, unsigned int, unsigned int, unsigned int, // Top, left, bottom and right padding
	    TOut *, size_t // Base output pointer, stride between matrices
	  )>;
	  const Kernel m_kernel;

	  protected:
	  /* Derive the bottom and right padding from the number of valid rows and
	   * columns, then hand the tile to the kernel. No working space is used.
	   */
	  void execute_tile(
	    unsigned int n_channels,
	    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
	    TOut *outptr, size_t ld_out_matrix,
	    unsigned int pad_top, unsigned int valid_rows,
	    unsigned int pad_left, unsigned int valid_cols,
	    void *working_space
	  ) const override
	  {
	    ARM_COMPUTE_UNUSED(working_space);

	    // Trailing padding is whatever part of the tile (after the leading
	    // padding has been removed) is not covered by valid elements.
	    const auto trailing_padding = [] (const unsigned int remaining, const unsigned int valid) -> unsigned int
	    {
	      return remaining >= valid ? remaining - valid : 0u;
	    };

	    const unsigned int rows_after_top = this->get_input_rows() - pad_top;
	    const unsigned int cols_after_left = this->get_input_cols() - pad_left;
	    const unsigned int pad_bottom = trailing_padding(rows_after_top, valid_rows);
	    const unsigned int pad_right = trailing_padding(cols_after_left, valid_cols);

	    // Run the transform on this tile
	    m_kernel(
	      n_channels, inptr, ld_in_row, ld_in_col,
	      pad_top, pad_left, pad_bottom, pad_right,
	      outptr, ld_out_matrix
	    );
	  }

	  public:
	  /* `kernel` is stored and invoked once for every tile. */
	  TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
	  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
	  {
	  }
	};

	/* Input transform whose kernel consumes an array of pointers, one for each
	 * point of the input tile, rather than a base pointer and strides.
	 *
	 * Each thread's slice of the working space is laid out as a `Workspace`
	 * header, followed by the pointer array, followed by a zero-filled buffer of
	 * `n_input_channels` elements.
	 */
	template <typename TIn, typename TOut=TIn>
	class TransformIndirect : public TransformBase<TIn, TOut>
	{
	  using Kernel = std::function<void(
	    unsigned int, // Number of channels
	    const TIn *const *, // Input pointers (one per point)
	    TOut *, size_t // Base output pointer, stride between matrices
	  )>;
	  const Kernel m_kernel;

	  struct Workspace
	  {
	    const TIn **inptrs;       // Pointer array handed to the kernel
	    const TIn *input_buffer;  // Zero-filled buffer handed to `fill_pointer_array`
	  };

	  /* Size in bytes of the pointer array: one `const TIn *` per tile point. */
	  size_t sizeof_inptr_array(void) const
	  {
	    return sizeof(const TIn *) * this->get_input_rows() * this->get_input_cols();
	  }

	  protected:
	  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
	  {
	    const size_t bytes_used =
	      sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels;

	    // Slices for consecutive threads are packed back to back, so round the
	    // size up to the alignment of the header; otherwise the `Workspace` and
	    // pointer array of every thread but the first could be misaligned.
	    // NOTE(review): this assumes the base working space pointer is itself
	    // suitably aligned - confirm against the allocator used by callers.
	    constexpr size_t alignment = alignof(Workspace);
	    return (bytes_used + alignment - 1) / alignment * alignment;
	  }

	  void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override
	  {
	    // The header sits at the very start of the slice...
	    Workspace *ws = reinterpret_cast<Workspace *>(buffer);
	    buffer = ws + 1;

	    // ...immediately followed by the pointer array...
	    ws->inptrs = reinterpret_cast<const TIn **>(buffer);
	    buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array();

	    // ...and finally the buffer of zeros, one element per input channel.
	    ws->input_buffer = reinterpret_cast<const TIn *>(buffer);
	    std::memset(buffer, 0, sizeof(TIn) * args.n_input_channels);
	  }

	  void execute_tile(
	    unsigned int n_channels,
	    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
	    TOut *outptr, size_t ld_out_matrix,
	    unsigned int pad_top, unsigned int valid_rows,
	    unsigned int pad_left, unsigned int valid_cols,
	    void *working_space
	  ) const override
	  {
	    // Get the working space
	    auto ws = reinterpret_cast<Workspace *>(working_space);

	    // Construct the input pointer array based on the given arguments
	    // (presumably padded points are directed at the zero-filled buffer -
	    // confirm against `fill_pointer_array`).
	    fill_pointer_array<const TIn>(
	      ws->inptrs, this->get_input_rows(), this->get_input_cols(),
	      inptr, ld_in_row, ld_in_col,
	      ws->input_buffer,
	      pad_top, valid_rows,
	      pad_left, valid_cols
	    );

	    // Execute the kernel
	    m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix);
	  }

	  public:
	  /* `kernel` is stored and invoked once for every tile. */
	  TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
	  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
	  {
	  }
	};

	/* Input transform whose kernel has no notion of padding.
	 *
	 * Tiles which lie entirely inside the input tensor are passed straight to the
	 * kernel. Any other tile is first copied into a zero-filled patch held in the
	 * working space, and the kernel is run on that patch instead.
	 */
	template <typename TIn, typename TOut=TIn>
	class TransformUnpadded : public TransformBase<TIn, TOut>
	{
	  using Kernel = std::function<void(
	    unsigned int, // Number of channels
	    const TIn *, size_t, size_t, // Pointer to first input element, row and column stride
	    TOut *, size_t // Base output pointer, stride between matrices
	  )>;
	  const Kernel m_kernel;

	  protected:
	  /* Each thread needs room for one complete tile: every point of the tile
	   * for every input channel.
	   */
	  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
	  {
	    const auto input_points = this->get_input_rows() * this->get_input_cols();
	    return sizeof(TIn) * input_points * args.n_input_channels;
	  }

	  void execute_tile(
	    unsigned int n_channels,
	    const TIn *inptr, size_t ld_in_row, size_t ld_in_col,
	    TOut *const outptr, const size_t ld_out_matrix,
	    const unsigned int pad_top, const unsigned int valid_rows,
	    const unsigned int pad_left, const unsigned int valid_cols,
	    void *const working_space
	  ) const override
	  {
	    // If there's any padding, then copy the valid portion of the tensor into
	    // the working space and reset the pointer, row and column strides to point
	    // at this copy of the data.
	    if (pad_top || valid_rows < this->get_input_rows() ||
	        pad_left || valid_cols < this->get_input_cols())
	    {
	      // The patch is densely packed: channels, then columns, then rows
	      const auto patch_ld_col = n_channels;
	      const auto patch_ld_row = patch_ld_col * this->get_input_cols();
	      auto patch = reinterpret_cast<TIn *>(working_space) +
	                   pad_top*patch_ld_row + pad_left*patch_ld_col;

	      // Fill the input patch with padding
	      std::memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row);

	      // Determine the bounds for which to copy
	      const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows());
	      const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols());

	      // Copy across the valid portion of the patch
	      for (auto i = pad_top; i < last_i; i++)
	      {
	        auto inptr_col = inptr;
	        inptr += ld_in_row;

	        auto patch_col = patch;
	        patch += patch_ld_row;

	        for (auto j = pad_left; j < last_j; j++)
	        {
	          // Perform the copy and progress both input and patch pointers
	          std::memcpy(patch_col, inptr_col, n_channels * sizeof(TIn));
	          inptr_col += ld_in_col;
	          patch_col += patch_ld_col;
	        }
	      }

	      // Override the input pointer and strides
	      inptr = reinterpret_cast<const TIn *>(working_space);
	      ld_in_col = patch_ld_col;
	      ld_in_row = patch_ld_row;
	    }

	    // Call the kernel
	    m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix);
	  }

	  public:
	  /* `kernel` is stored and invoked once for every tile. */
	  TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel)
	  : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel)
	  {
	  }

	  /* Utility method which can be used to get a transposed version of a kernel,
	   * this just calls the kernel with the input row and column strides reversed.
	   *
	   * This is deliberately not `constexpr`: the returned `std::function` is not
	   * a literal type, so the call could never be evaluated at compile time.
	   */
	  static Kernel get_transposed_kernel(const Kernel &kernel)
	  {
	    return [kernel] (
	      const unsigned int n_channels,
	      const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col,
	      TOut *const outptr, const size_t ld_out_matrix
	    ) {
	      kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix);
	    };
	  }
	};

	} // namespace input_transform
	} // namespace winograd
	} // namespace arm_conv