| /* |
| * Copyright (c) 2022 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #pragma once |
| |
#include "src/core/NEON/kernels/assembly/winograd.hpp"

#include "src/core/NEON/kernels/arm_conv/addressing.hpp"

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <functional>
#include <limits>
#include <string>
#include <utility>
| |
| namespace arm_conv { |
| namespace winograd { |
| namespace output_transform { |
| |
| /* Driver class for the Winograd output transforms. |
| * |
| * This provides a base implementation which handles iteration over the output |
| * tensor; subclasses are responsible for managing working space and executing |
| * the transform on individual tiles. |
| */ |
| template <typename TIn, typename TOut=TIn> |
| class TransformBase : public ITransform |
| { |
| const std::string m_name; |
| const unsigned int m_output_rows, m_output_cols; |
| const unsigned int m_kernel_rows, m_kernel_cols; |
| |
| protected: |
| virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const |
| { |
| return 0; |
| } |
| |
| virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const |
| { |
| // Nothing to do |
| } |
| |
| virtual void execute_tile( |
| unsigned int n_channels, |
| const TIn *inptr, size_t ld_in_matrix, |
| const TIn *bias, |
| TOut *outptr, size_t ld_out_row, size_t ld_out_col, |
| TOut activation_min, TOut activation_max, |
| unsigned int valid_rows, unsigned int valid_cols, |
| void *working_space |
| ) const = 0; |
| |
| void execute_internal( |
| const ConvolutionArgs &args, |
| const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, |
| const TIn *bias, |
| TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, |
| void *working_space, unsigned int thread_id, unsigned int n_threads |
| ) const |
| { |
| // Get the working space for this thread, and initialise it. |
| working_space = reinterpret_cast<char *>(working_space) + |
| this->get_working_space_per_thread(args) * thread_id; |
| this->initialise_thread_working_space(args, working_space); |
| |
| // Get the activation values |
| auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity()); |
| auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity()); |
| switch (args.activation.type) |
| { |
| case arm_gemm::Activation::Type::BoundedReLU: |
| activation_max = static_cast<TOut>(args.activation.param1); |
| // Fall through |
| case arm_gemm::Activation::Type::ReLU: |
| activation_min = static_cast<TOut>(0); |
| break; |
| default: |
| break; |
| } |
| |
| // Determine the number of tiles in a row, we use this to get the right |
| // offset into the input data. |
| const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols(); |
| |
| // Execute over all batches |
| for (unsigned int batch = 0; batch < args.n_batches; batch++) |
| { |
| auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row; |
| auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows(); |
| inptr += ld_in_batch; |
| outptr += ld_out_batch; |
| |
| // Stripe rows of tiles over threads. |
| for (auto out_i = thread_id * this->get_output_rows(); |
| out_i < args.output_shape.rows; |
| out_i += n_threads * this->get_output_rows()) |
| { |
| auto inptr_tile = inptr_row; |
| auto outptr_tile = outptr_row; |
| inptr_row += n_threads * n_tile_cols * ld_in_row; |
| outptr_row += n_threads * this->get_output_rows() * ld_out_row; |
| |
| // Iterate over all columns |
| for (auto out_j = 0u; out_j < args.output_shape.cols; |
| out_j += this->get_output_cols()) |
| { |
| // Execute the tile |
| this->execute_tile( |
| args.n_output_channels, |
| inptr_tile, ld_in_matrix, |
| bias, |
| outptr_tile, ld_out_row, ld_out_col, |
| activation_min, activation_max, |
| args.output_shape.rows - out_i, // Number of valid rows remaining |
| args.output_shape.cols - out_j, // Number of valid columns remaining |
| working_space |
| ); |
| |
| // Progress the pointers |
| inptr_tile += ld_in_row; |
| outptr_tile += this->get_output_cols() * ld_out_col; |
| } |
| } |
| } |
| } |
| |
| public: |
| TransformBase(const std::string &name, |
| unsigned int output_rows, unsigned int output_cols, |
| unsigned int kernel_rows, unsigned int kernel_cols) |
| : m_name(name), |
| m_output_rows(output_rows), m_output_cols(output_cols), |
| m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols) |
| { |
| } |
| |
| const std::string &get_name(void) const override { return m_name; } |
| |
| unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; } |
| unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; } |
| |
| unsigned int get_output_rows(void) const override final { return m_output_rows; } |
| unsigned int get_output_cols(void) const override final { return m_output_cols; } |
| |
| unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; } |
| unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; } |
| |
| size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override |
| { |
| return n_threads * this->get_working_space_per_thread(args); |
| } |
| |
| void execute( |
| const ConvolutionArgs &args, |
| const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, |
| const void *bias, |
| void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, |
| void *working_space, unsigned int thread_id, unsigned int n_threads |
| ) const override |
| { |
| execute_internal( |
| args, |
| reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row, |
| reinterpret_cast<const TIn *>(bias), |
| reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col, |
| working_space, thread_id, n_threads |
| ); |
| } |
| }; |
| |
| template <typename TIn, typename TOut=TIn> |
| class TransformUnpadded : public TransformBase<TIn, TOut> |
| { |
| using Kernel = std::function<void( |
| unsigned int n_channels, |
| const TIn *inptr, size_t ld_in_matrix, |
| const TIn *bias, |
| TOut *outptr, size_t ld_out_row, size_t ld_out_col, |
| TOut activation_min, TOut activation_max |
| )>; |
| const Kernel m_kernel; |
| |
| protected: |
| size_t get_working_space_per_thread(const ConvolutionArgs &args) const override |
| { |
| // We create a buffer the size of the output tile |
| const auto n_output_points = this->get_output_rows() * this->get_output_cols(); |
| return sizeof(TOut) * n_output_points * args.n_output_channels; |
| } |
| |
| void execute_tile( |
| unsigned int n_channels, |
| const TIn *inptr, size_t ld_in_matrix, |
| const TIn *bias, |
| TOut *outptr, size_t ld_out_row, size_t ld_out_col, |
| TOut activation_min, TOut activation_max, |
| unsigned int valid_rows, unsigned int valid_cols, |
| void *working_space |
| ) const override final |
| { |
| // Get copies of the output tensor parameters |
| auto kernel_outptr = outptr; |
| auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col; |
| |
| // If there's padding on either the left or the right, then we execute the |
| // kernel into the output buffer and then perform a copy. |
| if (valid_rows < this->get_output_rows() || |
| valid_cols < this->get_output_cols()) |
| { |
| // Override the kernel output parameters |
| kernel_outptr = reinterpret_cast<TOut *>(working_space); |
| kernel_ld_out_col = n_channels; |
| kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols(); |
| } |
| |
| // Execute the kernel |
| m_kernel( |
| n_channels, |
| inptr, ld_in_matrix, |
| bias, |
| kernel_outptr, kernel_ld_out_row, kernel_ld_out_col, |
| activation_min, activation_max |
| ); |
| |
| // If necessary, copy from the working space into the destination tensor. |
| if (valid_rows < this->get_output_rows() || |
| valid_cols < this->get_output_cols()) |
| { |
| const auto last_row = std::min(valid_rows, this->get_output_rows()); |
| const auto last_col = std::min(valid_cols, this->get_output_cols()); |
| |
| for (auto i = 0u; i < last_row; i++) |
| { |
| auto patch_tile = kernel_outptr; |
| auto out_tile = outptr; |
| kernel_outptr += kernel_ld_out_row; |
| outptr += ld_out_row; |
| |
| for (auto j = 0u; j < last_col; j++) |
| { |
| memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels); |
| patch_tile += kernel_ld_out_col; |
| out_tile += ld_out_col; |
| } |
| } |
| } |
| } |
| |
| public: |
| TransformUnpadded(const std::string &name, |
| unsigned int output_rows, unsigned int output_cols, |
| unsigned int kernel_rows, unsigned int kernel_cols, |
| const Kernel kernel) |
| : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols), |
| m_kernel(kernel) |
| { |
| } |
| |
| /* Utility method to get a transposed variant of a kernel, this transposed |
| * version simply calls the original kernel with the output row and column |
| * strides swapped. |
| */ |
| static constexpr Kernel get_transposed_kernel(const Kernel &kernel) |
| { |
| return [kernel] ( |
| const unsigned int n_channels, |
| const TIn *const inptr, const size_t ld_in_matrix, |
| const TIn *const bias, |
| TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col, |
| const TOut activation_min, const TOut activation_max |
| ) { |
| kernel(n_channels, inptr, ld_in_matrix, bias, |
| outptr, ld_out_col, ld_out_row, |
| activation_min, activation_max); |
| }; |
| } |
| }; |
| |
| } // namespace output_transform |
| } // namespace winograd |
| } // namespace arm_conv |