| /* |
| * Copyright (c) 2017-2019 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include <cstring> |
| #include "utils.hpp" |
| #include "winograd.hpp" |
| |
| using namespace winograd; |
| using array2 = std::pair<unsigned int, unsigned int>; |
| |
/* Expands to the two template headers, the return type RTYPE and the
 * qualified class name required to define a member of
 * WinogradGEMM<...>::Convolution<...> outside the class body. Each use is
 * followed directly by `::member_name(args) { ... }`.
 */
#define MEMBERFN(RTYPE)                                                  \
  template <int output_tile_rows, int output_tile_cols,                  \
            int kernel_rows, int kernel_cols, WinogradRoots roots>       \
  template <typename TOut, typename TIn,                                 \
            typename TGEMMIn, typename TGEMMOut>                         \
  RTYPE WinogradGEMM<output_tile_rows, output_tile_cols,                 \
                     kernel_rows, kernel_cols, roots>::                  \
      Convolution<TOut, TIn, TGEMMIn, TGEMMOut>
| |
| /** Get the output shape of a convolution. */ |
| MEMBERFN(array2) |
| ::get_output_shape(const std::pair<unsigned int, unsigned int> input_shape, |
| const bool padding_same) { |
| const unsigned int n_rows = |
| padding_same ? input_shape.first : input_shape.first - (kernel_rows - 1); |
| const unsigned int n_cols = padding_same |
| ? input_shape.second |
| : input_shape.second - (kernel_cols - 1); |
| return {n_rows, n_cols}; |
| } |
| |
| /** Get the memory required to store the kernel transformed into the |
| * Winograd domain. |
| */ |
| MEMBERFN(size_t) |
| ::get_kernel_storage_size(const unsigned int n_input_channels, |
| const unsigned int n_output_channels) { |
| return N_GEMMS * get_kernel_matrix_size(n_input_channels, n_output_channels); |
| } |
| |
| MEMBERFN(size_t) |
| ::get_input_storage_size(const unsigned int n_batches, |
| const unsigned int n_rows, const unsigned int n_cols, |
| const unsigned int n_channels, |
| const bool same_padding) { |
| return N_GEMMS * get_input_matrix_size(n_batches, n_rows, n_cols, n_channels, |
| same_padding); |
| } |
| |
| MEMBERFN(size_t) |
| ::get_output_storage_size(const unsigned int n_batches, |
| const unsigned int n_rows, const unsigned int n_cols, |
| const unsigned int n_channels) { |
| return N_GEMMS * |
| get_output_matrix_size(n_batches, n_rows, n_cols, n_channels); |
| } |
| |
| /** Get the memory required to apply a Winograd operator to some input. |
| */ |
| MEMBERFN(size_t) |
| ::get_working_space_size(const unsigned int n_batches, |
| const unsigned int n_rows, const unsigned int n_cols, |
| const unsigned int n_input_channels, |
| const unsigned int n_output_channels, |
| const bool padding_same) { |
| const auto output_shape = get_output_shape({n_rows, n_cols}, padding_same); |
| |
| // Get the memory required to store the matrices |
| const size_t matrix_sizes = |
| N_GEMMS * |
| (get_input_matrix_size(n_batches, n_rows, n_cols, n_input_channels, |
| padding_same) + |
| get_output_matrix_size(n_batches, output_shape.first, |
| output_shape.second, n_output_channels)); |
| return matrix_sizes; |
| } |
| |
| /* Get the memory required by a single "input" matrix. |
| */ |
| MEMBERFN(size_t) |
| ::get_input_matrix_size(const unsigned int n_batches, const unsigned int n_rows, |
| const unsigned int n_cols, |
| const unsigned int n_channels, |
| const bool same_padding) { |
| return get_input_matrix_stride(n_batches, n_rows, n_cols, n_channels, |
| same_padding) * |
| sizeof(TGEMMIn); |
| } |
| |
| MEMBERFN(int) |
| ::get_input_matrix_stride(const unsigned int n_batches, const unsigned int n_rows, |
| const unsigned int n_cols, |
| const unsigned int n_channels, |
| const bool same_padding) { |
| const auto output_shape = get_output_shape({n_rows, n_cols}, same_padding); |
| const unsigned int tile_rows = iceildiv(output_shape.first, output_tile_rows); |
| const unsigned int tile_cols = |
| iceildiv(output_shape.second, output_tile_cols); |
| const unsigned int M = |
| roundup<unsigned int>(n_batches * tile_rows * tile_cols, M_BLOCK); |
| const unsigned int K = n_channels; |
| |
| return M * K; |
| } |
| |
| /* Get the memory required by a single "output" matrix. |
| */ |
| MEMBERFN(size_t) |
| ::get_output_matrix_size(const unsigned int n_batches, |
| const unsigned int n_rows, const unsigned int n_cols, |
| const unsigned int n_channels) { |
| return get_output_matrix_stride(n_batches, n_rows, n_cols, n_channels) * |
| sizeof(TGEMMOut); |
| } |
| |
| MEMBERFN(int) |
| ::get_output_matrix_stride(const unsigned int n_batches, |
| const unsigned int n_rows, const unsigned int n_cols, |
| const unsigned int n_channels) { |
| // Compute shape for the GEMM |
| const int tile_rows = iceildiv(n_rows, output_tile_rows); |
| const int tile_cols = iceildiv(n_cols, output_tile_cols); |
| const int M = roundup<int>(tile_rows * tile_cols, M_BLOCK); |
| const int N = roundup<int>(n_channels, N_BLOCK); |
| |
| return n_batches * M * N; |
| } |
| |
| |
| /* Get the memory required by a single "kernel" matrix. |
| */ |
| MEMBERFN(size_t) |
| ::get_kernel_matrix_size(const unsigned int n_input_channels, |
| const unsigned int n_output_channels) { |
| return sizeof(TGEMMIn) * |
| get_kernel_matrix_stride(n_input_channels, n_output_channels); |
| } |
| |
| MEMBERFN(int) |
| ::get_kernel_matrix_stride(const unsigned int n_input_channels, |
| const unsigned int n_output_channels) { |
| return n_input_channels * roundup<int>(n_output_channels, N_BLOCK); |
| } |
| |
| // Instantiate required implementations |
| template class WinogradGEMM<2, 2, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| |
| template class WinogradGEMM<1, 6, 1, 3, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| template class WinogradGEMM<6, 1, 3, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| |
| template class WinogradGEMM<2, 2, 5, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| |
| template class WinogradGEMM<1, 4, 1, 5, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| template class WinogradGEMM<4, 1, 5, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| |
| template class WinogradGEMM<1, 2, 1, 7, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| template class WinogradGEMM<2, 1, 7, 1, WinogradRoots::Integers>::Convolution<float, float, float, float>; |
| |
| #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC |
| template class WinogradGEMM<4, 4, 3, 3, WinogradRoots::Integers>::Convolution<__fp16, __fp16, __fp16, __fp16>; |
| #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC |