| /* |
| * Copyright (c) 2017 ARM Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| #pragma once |
| #include <cstdint> |
| #include <cstdlib> |
| #include <cassert> |
| |
| #include "gemm.hpp" |
| #include "profiler.hpp" |
| #include "utils.hpp" |
| #include "shims.hpp" |
| |
| #include "transforms.hpp" |
| |
| namespace winograd { |
| /***************************************************************************/ |
| /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM |
| * internally. |
| */ |
| template <typename TOut, typename TIn> |
| class Winograd2x2_3x3GEMM { |
| public: |
| /* Instantiate a new Winograd operator. |
| */ |
| Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage); |
| virtual ~Winograd2x2_3x3GEMM(); |
| |
| /** Transform the weights into the Winograd domain. |
| */ |
| template <typename KernelTransform=winograd2x2_3x3_gemm_kernel_transform_impl<TIn>> |
| void transform_weights(const TIn* const kernel, void *transform_working_space); |
| |
| /* Initializes matrices pointers, to be called once before execute() |
| */ |
| template <typename InputTransform=Winograd2x2_3x3GemmInputChannelwise<TIn>> |
| void reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const TIn* const input, void* working_space); |
| |
| /* Apply the Winograd operator to some input. |
| */ |
| template <typename OutputTransform=Winograd2x2_3x3GemmOutput<TOut>> |
| void reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output); |
| |
| |
| /* Apply the Winograd operator to some input. |
| */ |
| void execute(size_t first, size_t last); |
| |
| /* Get the memory required to transform the kernel. |
| */ |
| static inline size_t get_kernel_transform_working_size(const KernelShape &shape); |
| |
| /* Get the output shape of a convolution. |
| */ |
| static Tensor4DShape get_output_shape(const Tensor4DShape &input_shape, const KernelShape &k_shape, |
| const PaddingType padding_type); |
| |
| /* Get the memory required to instantiate a new Winograd operator. |
| */ |
| static size_t get_kernel_storage_size(const KernelShape &shape); |
| |
| /* Get the memory required to apply a Winograd operator to some input. |
| */ |
| static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, |
| const PaddingType padding); |
| |
| |
| Winograd2x2_3x3GEMM(const Winograd2x2_3x3GEMM &) = delete; |
| /** Prevent instances of this class from being copied (As this class contains pointers) */ |
| Winograd2x2_3x3GEMM &operator=(const Winograd2x2_3x3GEMM &) = delete; |
| /** Allow instances of this class to be moved */ |
| Winograd2x2_3x3GEMM(Winograd2x2_3x3GEMM &&) = default; |
| /** Allow instances of this class to be moved */ |
| Winograd2x2_3x3GEMM &operator=(Winograd2x2_3x3GEMM &&) = default; |
| |
| protected: |
| /* Get the memory required by a single "input" matrix. |
| */ |
| static size_t get_input_matrix_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, |
| const PaddingType padding); |
| |
| /* Get the memory required by a single "output" matrix. |
| */ |
| static size_t get_output_matrix_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, |
| const PaddingType padding); |
| |
| /* Get the memory required by a single "kernel" matrix. |
| */ |
| static size_t get_kernel_matrix_size(const KernelShape &shape); |
| |
| const KernelShape kernel_shape; // Shape of applied kernel |
| const Tensor4DShape in_shape; |
| const PaddingType padding; |
| |
| const int kernel_matrix_row_stride; // Stride within kernel matrix |
| |
| const bool manage_kernel_storage; // Free kernel storage when done |
| void* const _kernel_storage; // Base pointer for kernel matrices |
| |
| profiler prof; // Profiler |
| |
| TIn *kernel_matrices[16]; // Prepared form of kernel |
| TIn *input_matrices[16]; |
| TOut *output_matrices[16]; |
| |
| |
| static const int M_BLOCK = 4; |
| static const int N_BLOCK = 16; |
| }; |
| } // namespace winograd |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_transform_working_size( |
| const KernelShape &shape |
| ) |
| { |
| // Need to re-order the kernel into HWIO form, require enough space to |
| // represent the tensor. |
| return sizeof(TIn) * shape.size(); |
| } |
| |
| |
| template <typename TOut, typename TIn> |
| template <typename KernelTransform> |
| void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::transform_weights( |
| const TIn* const kernel, |
| void *transform_working_space |
| ) |
| { |
| const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape); |
| int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage); |
| for (int i = 0; i < 16; i++) { |
| kernel_matrices[i] = reinterpret_cast<TIn *>( |
| ks_bytes + i*kernel_matrix_size_bytes); |
| } |
| |
| const TIn *kernel_hwio = kernel; |
| if( transform_working_space) |
| { |
| kernel_hwio = reinterpret_cast<TIn *>(transform_working_space); |
| ofm_ifm_h_w_to_h_w_ifm_ofm( |
| kernel, const_cast<TIn *>(kernel_hwio), |
| kernel_shape.n_output_channels, |
| kernel_shape.n_input_channels, |
| kernel_shape.n_rows, |
| kernel_shape.n_cols |
| ); |
| } |
| KernelTransform::execute( |
| kernel_shape, kernel_hwio, kernel_matrices[0], |
| kernel_matrix_size_bytes / sizeof(TIn), |
| kernel_matrix_row_stride |
| ); |
| } |
| |
| template <typename TOut, typename TIn> |
| winograd::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM( const KernelShape &kernel_shape, const Tensor4DShape input_shape, |
| const PaddingType padding_type, void *kernel_storage) |
| : kernel_shape(kernel_shape), in_shape(input_shape), padding(padding_type),kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)), manage_kernel_storage(false), |
| _kernel_storage(kernel_storage), prof() { |
| memset(kernel_matrices, 0x00, sizeof(TIn)*16); |
| memset(input_matrices, 0x00, sizeof(TIn)*16); |
| memset(output_matrices, 0x00, sizeof(TOut)*16); |
| } |
| |
| /*****************************************************************************/ |
| template <typename TOut, typename TIn> |
| winograd::Winograd2x2_3x3GEMM<TOut, TIn>::~Winograd2x2_3x3GEMM() {} |
| |
| /*****************************************************************************/ |
| template <typename TOut, typename TIn> |
| template <typename InputTransform> |
| void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_input( |
| const Tensor4DShape& input_shape, |
| const PaddingType padding_type, |
| const TIn* const input, |
| void *working_space |
| ) { |
| assert(working_space); |
| int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space); |
| // Split the working space into that required for 16 input matrices and |
| // output matrices. |
| const int in_matrix_stride_bytes = get_input_matrix_size(input_shape, kernel_shape, padding_type); |
| const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type); |
| |
| for (int i = 0; i < 16; i++) { |
| input_matrices[i] = reinterpret_cast<TIn *>( |
| ws_bytes + i*in_matrix_stride_bytes); |
| output_matrices[i] = reinterpret_cast<TIn *>( |
| ws_bytes + 16*in_matrix_stride_bytes + i*out_matrix_stride_bytes); |
| } |
| |
| // Compute shape for the GEMM |
| const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type); |
| const int tile_rows = iceildiv(output_shape.n_rows, 2); |
| const int tile_cols = iceildiv(output_shape.n_cols, 2); |
| const int K = kernel_shape.n_input_channels; |
| |
| const int in_matrix_row_stride = K; |
| const int in_matrix_batch_stride = tile_rows*tile_cols*in_matrix_row_stride; |
| |
| // Transform the input tensor into an appropriate form |
| auto input_prep = [&] () { |
| InputTransform::execute( |
| input, input_shape, padding_type, tile_rows, tile_cols, |
| input_matrices[0], in_matrix_stride_bytes / sizeof(TIn), |
| in_matrix_batch_stride, in_matrix_row_stride |
| ); |
| }; |
| prof( |
| "Input Prep", input_prep, |
| InputTransform::bytes_read(input_shape, output_shape), |
| InputTransform::flops_performed(input_shape, output_shape), |
| InputTransform::bytes_written(input_shape, output_shape) |
| ); |
| |
| } |
| |
| /*****************************************************************************/ |
| template <typename TOut, typename TIn> |
| template <typename OutputTransform> |
| void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output) { |
| assert(output_matrices[0]); |
| const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type); |
| const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type); |
| const int out_matrix_row_stride = kernel_matrix_row_stride; |
| |
| // Transform the output tensor into an appropriate form |
| OutputTransform::execute( |
| output_shape, |
| output_matrices[0], |
| out_matrix_stride_bytes / sizeof(TOut), |
| out_matrix_row_stride, |
| output |
| ); |
| } |
| |
| |
| /*****************************************************************************/ |
| template <typename TOut, typename TIn> |
| void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::execute( size_t first, size_t last ) { |
| assert(input_matrices[0] && kernel_matrices[0] && output_matrices[0]); |
| assert(first < 16 && last < 16 && first < last); |
| // Compute shape for the GEMM |
| const auto output_shape = get_output_shape(in_shape,kernel_shape, padding); |
| const int tile_rows = iceildiv(output_shape.n_rows, 2); |
| const int tile_cols = iceildiv(output_shape.n_cols, 2); |
| const int M = in_shape.n_batches * tile_rows * tile_cols; |
| const int K = kernel_shape.n_input_channels; |
| const int N = kernel_shape.n_output_channels; |
| |
| const int in_matrix_row_stride = K; |
| const int out_matrix_row_stride = kernel_matrix_row_stride; |
| // Perform the GEMMs |
| for (size_t i = first; i <= last; i++) { |
| BlockedGemm<M_BLOCK, N_BLOCK>( |
| input_matrices[i], kernel_matrices[i], output_matrices[i], M, K, N, |
| in_matrix_row_stride, kernel_matrix_row_stride, out_matrix_row_stride |
| ); |
| // prof("GEMM", perform_gemm, 0, 2*M*K*N, 0); // TODO Memory |
| } |
| |
| } |
| |
| /*****************************************************************************/ |
| template <typename TOut, typename TIn> |
| Tensor4DShape winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape( |
| const Tensor4DShape &in_shape, const KernelShape &k_shape, const PaddingType padding) { |
| return Tensor4DShape { |
| in_shape.n_batches, |
| (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2, |
| (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2, |
| k_shape.n_output_channels |
| }; |
| } |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_storage_size( |
| const KernelShape &shape) { |
| return 16 * get_kernel_matrix_size(shape); |
| } |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_matrix_size( |
| const KernelShape &shape) { |
| const int K = shape.n_input_channels; |
| const int N = roundup(shape.n_output_channels, N_BLOCK); |
| return sizeof(TIn) * K * N; |
| } |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size( |
| const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type |
| ) { |
| return 16 * get_input_matrix_size(input_shape, k_shape, padding_type) + |
| 16 * get_output_matrix_size(input_shape, k_shape, padding_type); |
| } |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size( |
| const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type |
| ) { |
| // Compute shape for the GEMM |
| const auto output_shape = get_output_shape(input_shape, k_shape, padding_type); |
| const int tile_rows = iceildiv(output_shape.n_rows, 2); |
| const int tile_cols = iceildiv(output_shape.n_cols, 2); |
| const int M = roundup(tile_rows * tile_cols, M_BLOCK); |
| const int K = k_shape.n_input_channels; |
| |
| return input_shape.n_batches * M * K * sizeof(TIn); |
| } |
| |
| template <typename TOut, typename TIn> |
| size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size( |
| const Tensor4DShape& input_shape, const KernelShape &k_shape,const PaddingType padding_type |
| ) { |
| // Compute shape for the GEMM |
| const auto output_shape = get_output_shape(input_shape, k_shape, padding_type); |
| const int tile_rows = iceildiv(output_shape.n_rows, 2); |
| const int tile_cols = iceildiv(output_shape.n_cols, 2); |
| const int M = roundup(tile_rows * tile_cols, M_BLOCK); |
| const int N = roundup(k_shape.n_output_channels, N_BLOCK); |
| |
| return input_shape.n_batches * M * N * sizeof(TOut); |
| } |