| /* |
| * Copyright (c) 2017-2019 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #pragma once |
| |
| #include "arm_gemm.hpp" |
| |
| #include <cstddef> |
| #include <utility> |
| |
| namespace winograd |
| { |
| |
/**
 * Abstract interface common to the Winograd transform operators.
 *
 * Declares the working-space management and the windowed execution entry
 * points; all members are pure virtual and must be provided by implementers.
 */
class ITransform
{
  public:
  virtual ~ITransform() = default;

  /**
   * Get the working space required to perform the transformation.
   *
   * Note, the working space is only required when performing the
   * transformation - hence it can be reused whenever the transformation is
   * not running.
   *
   * @param nthreads The greatest number of threads that will be used to execute the transform.
   * @return Size of working space required in bytes.
   */
  virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;

  /**
   * Set the working space to be used by the transformation.
   *
   * Note, the working space is only required when performing the
   * transformation - hence it can be reused whenever the transformation is
   * not running.
   *
   * @param buffer Pointer to the working space.
   */
  virtual void set_working_space(void *buffer) = 0;

  /**
   * Get the window of work a given operator can perform.
   *
   * @return Size of the window; the unit is defined by the implementation.
   */
  virtual unsigned int get_window() const = 0;

  /**
   * Perform work upon a window of the transform.
   *
   * @param start    Start of the range of work to perform.
   * @param stop     End of the range of work to perform.
   *                 NOTE(review): presumably exclusive and bounded by
   *                 get_window() - confirm against the implementations.
   * @param threadid Identifier of the calling thread.
   */
  virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0;
};
| |
| class IInputTransform : public ITransform |
| { |
| public: |
| virtual ~IInputTransform() = default; |
| |
| /** |
| * Set the pointer to the (NHWC-ordered) tensor to be transformed. |
| */ |
| virtual void set_input_tensor(const void *input) = 0; |
| |
| /** |
| * Set the pointer to the (NHWC-ordered) tensor to be transformed. |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_input_tensor(const void *input, int col_stride) = 0; |
| |
| /** |
| * Set the pointer to the (NHWC-ordered) tensor to be transformed. |
| * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0; |
| |
| /** |
| * Set the pointer to the (NHWC-ordered) tensor to be transformed. |
| * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes). |
| * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0; |
| |
| /** |
| * Set pointers to the matrices written by the transform. |
| * @param matrices Pointer to the start of the first matrix representing the transformed input. |
| * @param inter_matrix_stride Stride (in elements) between matrices. |
| * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. |
| */ |
| virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; |
| }; |
| |
| class IOutputTransform : public ITransform |
| { |
| public: |
| virtual ~IOutputTransform() = default; |
| |
| /** |
| * Set pointers to the matrices written by the transform. |
| * @param matrices Pointer to the start of the first matrix representing the input to the transform. |
| * @param inter_matrix_stride Stride (in elements) between matrices. |
| * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. |
| */ |
| virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; |
| |
| /** |
| * Set pointer to the bias tensor (can be ignored or called with nullptr for no bias. |
| */ |
| virtual void set_bias(const void *bias=nullptr) = 0; |
| |
| /** |
| * Set pointer to the output tensor produced by the transform. |
| */ |
| virtual void set_output_tensor(void *output) = 0; |
| |
| /** |
| * Set pointer to the output tensor produced by the transform. |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_output_tensor(void *output, int col_stride) = 0; |
| |
| /** |
| * Set pointer to the output tensor produced by the transform. |
| * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0; |
| |
| /** |
| * Set pointer to the output tensor produced by the transform. |
| * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes). |
| * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). |
| * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). |
| */ |
| virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0; |
| }; |
| |
| class IWeightTransform : public ITransform |
| { |
| public: |
| virtual ~IWeightTransform() = default; |
| |
| /** Set pointer to the weight tensor read by the transform. */ |
| virtual void set_weight_tensor(const void *weights) = 0; |
| |
| /** |
| * Set pointers to the matrices written by the transform. |
| * @param matrices Pointer to the start of the first matrix representing the transformed input. |
| * @param inter_matrix_stride Stride (in elements) between matrices. |
| * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. |
| */ |
| virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; |
| }; |
| |
/**
 * Tag selecting a variant of the Winograd transforms; used as a template
 * argument by the transform and GEMM class templates.
 */
enum class WinogradRoots
{
  Integers, /**< The only variant currently declared. */
};
| |
| template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots> |
| class InputTransform : public IInputTransform |
| { |
| public: |
| /** Create an InputTransform operator fixed on a given problem and set of |
| * pointers. |
| */ |
| InputTransform( |
| int kernel_rows, /**< Number of rows in the kernel */ |
| int kernel_cols, /**< Number of columns in the kernel */ |
| int n_batches, /**< Number of batches in input tensor. */ |
| int n_rows, /**< Number of rows in input tensor. */ |
| int n_cols, /**< Number of columns in input tensor. */ |
| int n_channels, /**< Number of channels in input tensor. */ |
| int padding_top, /**< Padding to apply to the top of the image. */ |
| int padding_left, /**< Padding to apply to the left of the image. */ |
| int padding_bottom, /**< Padding to apply to the bottom of the image. */ |
| int padding_right /**< Padding to apply to the right of the image. */ |
| ); |
| |
| InputTransform(InputTransform&) = delete; |
| InputTransform operator=(InputTransform&) = delete; |
| |
| /** Set pointers to the input tensor read by the transform. */ |
| void set_input_tensor(const void *input) override; |
| void set_input_tensor(const void *input, int col_stride) override; |
| void set_input_tensor(const void *input, int row_stride, int col_stride) override; |
| void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override; |
| |
| /** Set pointers to the matrices written by the transform. */ |
| void set_output_matrices(void *matrices, int iter_matrix_stride, int matrix_row_stride) override; |
| |
| /** Get the working space required to perform the transformation. */ |
| size_t get_working_space_size(unsigned int nthreads=1) const override; |
| void set_working_space(void *buffer) override; |
| |
| /** Get the window of work a given operator can perform. */ |
| unsigned int get_window() const override; |
| static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window |
| |
| /** Perform work upon a window of the input. */ |
| void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; |
| |
| protected: |
| const int _n_batches, _n_rows, _n_cols, _n_channels; |
| |
| private: |
| void transform_unpadded_tile( |
| unsigned int threadid, |
| int n_channels, |
| TOut *outptr, |
| const TIn *inptr |
| ); |
| |
| void transform_padded_tile( |
| unsigned int threadid, |
| int n_channels, |
| TOut *outptr, |
| const TIn *inptr, |
| int padding_top, |
| int padding_left, |
| int padding_bottom, |
| int padding_right |
| ); |
| |
| /* Tile implementation */ |
| static void transform_tile( |
| int n_channels, /** @param[in] Number of channels in the tensor. */ |
| const TIn* inptr_base, /** @param[in] Pointer to the base of the input tile. */ |
| int input_row_stride, /** @param[in] Stride between rows of the input tensor. */ |
| int input_col_stride, /** @param[in] Stride between columns of the input tensor. */ |
| TOut* mptr_base, /** @param[out] Base pointer to transformed input matrices. */ |
| int matrix_stride /** @param[in] Stride between matrices in the input space. */ |
| ); |
| |
| /** Get the working space for a thread. */ |
| void * get_working_space(unsigned int threadid) const; |
| |
| const TIn* _inptr; |
| TOut* _outptr; |
| |
| const int _overlap_rows, _overlap_cols; |
| const int _padding_top, _padding_left, _padding_bottom, _padding_right; |
| const int _tiles_M, _tiles_N; |
| int _matrix_stride, _matrix_row_stride, _matrix_batch_stride; |
| int _in_col_stride, _in_row_stride, _in_batch_stride; |
| |
| const int _working_space_col_stride, _working_space_row_stride; |
| TIn *_working_space; |
| }; |
| |
| template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots> |
| class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> : |
| public InputTransform<1, InnerTileRows, TIn, TOut, Roots> |
| { |
| using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>; |
| |
| public: |
| InputTransform( |
| int kernel_rows, /**< Number of rows in the kernel. */ |
| int kernel_cols, /**< Number of columns in the kernel. */ |
| int n_batches, /**< Number of batches in input tensor. */ |
| int n_rows, /**< Number of rows in input tensor. */ |
| int n_cols, /**< Number of columns in input tensor. */ |
| int n_channels, /**< Number of channels in input tensor. */ |
| int padding_top, /**< Padding to apply to the top of the image. */ |
| int padding_left, /**< Padding to apply to the left of the image. */ |
| int padding_bottom, /**< Padding to apply to the bottom of the image. */ |
| int padding_right /**< Padding to apply to the right of the image. */ |
| ); |
| |
| /** Set pointers to the input tensor read by the transform. */ |
| void set_input_tensor(const void *input) override; |
| void set_input_tensor(const void *input, int col_stride) override; |
| void set_input_tensor(const void *input, int row_stride, int col_stride) override; |
| void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override; |
| }; |
| |
| template < |
| int KernelRows, int KernelCols, |
| int InnerTileRows, int InnerTileCols, |
| typename TIn, typename TOut, |
| WinogradRoots Roots |
| > |
| class OutputTransform : public IOutputTransform |
| { |
| public: |
| OutputTransform( |
| int n_batches, /**< Number of batches in output tensor. */ |
| int n_rows, /**< Number of rows in output tensor. */ |
| int n_cols, /**< Number of columns in output tensor. */ |
| int n_channels, /**< Number of channels in output tensor. */ |
| const arm_gemm::Activation &activation |
| ); |
| |
| OutputTransform(OutputTransform&) = delete; |
| OutputTransform operator=(OutputTransform&) = delete; |
| |
| /** Set pointers to the matrices read by the transform. */ |
| void set_input_matrices(const void *matrices, int iter_matrix_stride, int matrix_row_stride) override; |
| |
| /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias */ |
| void set_bias(const void *bias=nullptr) override; |
| |
| /** Set pointers to the output tensor written by the transform. */ |
| void set_output_tensor(void *output) override; |
| void set_output_tensor(void *output, int col_stride) override; |
| void set_output_tensor(void *output, int row_stride, int col_stride) override; |
| void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override; |
| |
| /** Get the working space required to perform the transformation. */ |
| size_t get_working_space_size(unsigned int nthreads=1) const override; |
| void set_working_space(void *buffer) override; |
| |
| /** Get the window of work a given operator can perform. */ |
| unsigned int get_window() const override; |
| static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window |
| |
| /** Perform work upon a window of the input. */ |
| void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; |
| |
| protected: |
| static constexpr int inner_tile_rows = InnerTileRows; |
| static constexpr int inner_tile_cols = InnerTileCols; |
| static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1; |
| static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1; |
| |
| const int _n_batches, _n_rows, _n_cols, _n_channels; |
| const TOut _output_min, _output_max; |
| |
| private: |
| void transform_uncropped_tile( |
| unsigned int threadid, |
| int n_channels, |
| TOut *outptr, |
| const TIn *inptr, |
| const TOut *biases |
| ); |
| |
| void transform_cropped_tile( |
| unsigned int threadid, |
| int n_channels, |
| TOut *outptr, |
| const TIn *inptr, |
| const TOut *biases, |
| int pad_bottom, |
| int pad_right |
| ); |
| |
| /** Implementation of the tile transformation method. */ |
| static void transform_tile( |
| int n_channels, |
| const TIn* matrix_base, |
| int matrix_stride, |
| const TOut* biases, |
| TOut* output, |
| int output_row_stride, |
| int output_col_stride, |
| TOut output_min, |
| TOut output_max |
| ); |
| |
| /** Get the working space for a thread. */ |
| void * get_working_space(unsigned int threadid) const; |
| |
| const TIn* _matrix_base; |
| const TOut* _biases; |
| int _matrix_stride, _matrix_row_stride, _matrix_batch_stride; |
| TOut* _outptr; |
| const int _tiles_M, _tiles_N; |
| int _out_col_stride, _out_row_stride, _out_batch_stride; |
| |
| const int _working_space_col_stride, _working_space_row_stride; |
| TOut *_working_space; |
| }; |
| |
| template < |
| int KernelRows, |
| int InnerTileRows, |
| typename TIn, typename TOut, |
| WinogradRoots Roots |
| > |
| class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> : |
| public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots> |
| { |
| using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>; |
| |
| public: |
| OutputTransform( |
| int n_batches, /**< Number of batches in output tensor. */ |
| int n_rows, /**< Number of rows in output tensor. */ |
| int n_cols, /**< Number of columns in output tensor. */ |
| int n_channels, /**< Number of channels in output tensor. */ |
| const arm_gemm::Activation &activation |
| ); |
| |
| /** Set pointers to the output tensor written by the transform. */ |
| void set_output_tensor(void *output) override; |
| void set_output_tensor(void *output, int col_stride) override; |
| void set_output_tensor(void *output, int row_stride, int col_stride) override; |
| void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override; |
| }; |
| |
| template < |
| int KernelRows, int KernelCols, |
| int InnerTileRows, int InnerTileCols, |
| typename TIn, typename TOut, |
| WinogradRoots Roots |
| > |
| class WeightTransform : public IWeightTransform |
| { |
| public: |
| WeightTransform( |
| int n_output_channels, /**< Number of output channels in the kernel. */ |
| int n_input_channels /**< Number of input channels in the kernel. */ |
| ); |
| |
| WeightTransform(WeightTransform&) = delete; |
| WeightTransform operator=(WeightTransform&) = delete; |
| |
| /** Set pointer to the weight tensor read by the transform. */ |
| void set_weight_tensor(const void *weights) override; |
| |
| /** Set pointer to the matrices written by the transform. */ |
| void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override; |
| |
| /** Get the working space required to perform the transformation. */ |
| size_t get_working_space_size(unsigned int nthreads=1) const override; |
| void set_working_space(void *buffer) override; |
| |
| /** Get the window of work a given operator can perform. */ |
| unsigned int get_window() const override; |
| static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window |
| |
| /** Perform work upon a window of the input. */ |
| void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; |
| |
| protected: |
| static const int kernel_rows = KernelRows; |
| static const int kernel_cols = KernelCols; |
| static const int inner_tile_rows = InnerTileRows; |
| static const int inner_tile_cols = InnerTileCols; |
| |
| private: |
| /** Apply the transform to a tensor. */ |
| static void execute( |
| int n_output_channels, |
| int n_input_channels, |
| const TIn* input, |
| TOut* output, |
| int matrix_stride, |
| int matrix_row_stride |
| ); |
| |
| const int _n_output_channels, _n_input_channels; |
| TOut *_matrices; |
| int _matrix_stride, _matrix_row_stride; |
| const TIn *_weights; |
| }; |
| |
| template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots> |
| class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> : |
| public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots> |
| { |
| public: |
| using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform; |
| }; |
| |
| template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots> |
| class WinogradGEMM |
| { |
| public: |
| // Information about the specific Winograd instance |
| static constexpr int output_tile_rows = OutputTileRows; |
| static constexpr int output_tile_cols = OutputTileCols; |
| static constexpr int kernel_rows = KernelRows; |
| static constexpr int kernel_cols = KernelCols; |
| static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1; |
| static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1; |
| static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols; |
| |
| /** Transform weights from the spatial to the Winograd domain. */ |
| template <typename TIn, typename TOut> |
| using WeightsTransform = WeightTransform< |
| KernelRows, KernelCols, inner_tile_rows, inner_tile_cols, |
| TIn, TOut, Roots |
| >; |
| |
| /** Transform input feature maps from the spatial to the Winograd domain. |
| */ |
| template <typename TIn, typename TOut> |
| using InputTransform = InputTransform< |
| inner_tile_rows, inner_tile_cols, TIn, TOut, Roots |
| >; |
| |
| /** Transform output feature maps from the Winograd to the spatial domain. |
| */ |
| template <typename TIn, typename TOut> |
| using OutputTransform = OutputTransform< |
| KernelRows, KernelCols, inner_tile_rows, inner_tile_cols, |
| TIn, TOut, Roots |
| >; |
| |
| /** Perform a convolution. |
| */ |
| template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut> |
| class Convolution |
| { |
| public: |
| // Information about the typed Winograd instance |
| typedef TOut OutputType; |
| typedef TOutGEMM GemmOutputType; |
| typedef TInGEMM GemmInputType; |
| typedef TIn InputType; |
| |
| /** Get the output shape of a convolution. */ |
| static std::pair<unsigned int, unsigned int> get_output_shape( |
| const std::pair<unsigned int, unsigned int> input_shape, |
| bool padding_same); |
| |
| /** Get the memory required to store the kernel transformed into the |
| * Winograd domain. |
| */ |
| static size_t get_kernel_storage_size(unsigned int n_input_channels, |
| unsigned int n_output_channels); |
| |
| /** Get the memory required to store the input tensor transformed into |
| * the Winograd domain. |
| */ |
| static size_t get_input_storage_size( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of input rows |
| unsigned int n_cols, // Number of input columns |
| unsigned int n_channels, // Number of input channels |
| bool padding_same); |
| |
| /** Get the memory required to store the output tensor in the Winograd |
| * domain. |
| */ |
| static size_t get_output_storage_size( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of output rows |
| unsigned int n_cols, // Number of output columns |
| unsigned int n_channels // Number of output channels |
| ); |
| |
| /** Get the memory required to apply a Winograd operator to some input. |
| */ |
| static size_t get_working_space_size( |
| unsigned int n_batches, |
| unsigned int n_rows, // Number of input rows |
| unsigned int n_cols, // Number of input columns |
| unsigned int n_input_channels, // Number of input channels |
| unsigned int n_output_channels, // Number of output channels |
| bool padding_same); |
| |
| /* Get the memory required by a single "input" matrix. |
| */ |
| static size_t get_input_matrix_size( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of input rows |
| unsigned int n_cols, // Number of input columns |
| unsigned int n_channels, // Number of input channels |
| bool padding_same); |
| |
| static int get_input_matrix_stride( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of input rows |
| unsigned int n_cols, // Number of input columns |
| unsigned int n_channels, // Number of input channels |
| bool padding_same); |
| |
| /* Get the memory required by a single "output" matrix. |
| */ |
| static size_t get_output_matrix_size( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of output rows |
| unsigned int n_cols, // Number of output columns |
| unsigned int n_channels // Number of output channels |
| ); |
| |
| static int get_output_matrix_stride( |
| unsigned int n_batches, // Number of batches |
| unsigned int n_rows, // Number of output rows |
| unsigned int n_cols, // Number of output columns |
| unsigned int n_channels // Number of output channels |
| ); |
| |
| /* Get the memory required by a single "kernel" matrix. |
| */ |
| static size_t get_kernel_matrix_size(unsigned int n_input_channels, |
| unsigned int n_output_channels); |
| static int get_kernel_matrix_stride(unsigned int n_input_channels, |
| unsigned int n_output_channels); |
| |
| static constexpr int M_BLOCK = 4; /** Size of block used by GEMM. */ |
| static constexpr int N_BLOCK = 16; /** Size of block used by GEMM. */ |
| }; |
| }; |
| |
| } // namespace winograd |