blob: 5148495608ff952a011ab78738a3bd7393408640 [file] [log] [blame]
/*
* Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>
namespace arm_conv {
namespace winograd {
namespace output_transform {
/* Driver class for the Winograd output transforms.
*
* This provides a base implementation which handles iteration over the output
* tensor; subclasses are responsible for managing working space and executing
* the transform on individual tiles.
*/
template <typename TIn, typename TOut=TIn>
class TransformBase : public ITransform
{
  const std::string m_name;  // Name reported through get_name()
  const unsigned int m_output_rows, m_output_cols;  // Shape of one output tile
  const unsigned int m_kernel_rows, m_kernel_cols;  // Shape of the convolution kernel

  protected:
  /* Number of bytes of scratch space required by each thread.
   *
   * Subclasses which need per-thread working space override this; the base
   * implementation requires none.
   */
  virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
  {
    return 0;
  }

  /* Initialise one thread's slice of the working space; nothing to do by
   * default. */
  virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
  {
    // Nothing to do
  }

  /* Execute the output transform for a single tile.
   *
   * `valid_rows` / `valid_cols` are the number of output rows/columns
   * remaining in the tensor from this tile's origin; they may exceed the
   * tile size, in which case the whole tile is valid and no clipping is
   * needed. `working_space` points at this thread's scratch slice.
   */
  virtual void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const = 0;

  /* Iterate over the output tensor for every batch, calling `execute_tile`
   * for each tile assigned to this thread. Whole rows of tiles are striped
   * across threads (thread t handles tile-rows t, t + n_threads, ...).
   */
  void execute_internal(
    const ConvolutionArgs &args,
    const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const TIn *bias,
    TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const
  {
    // Get the working space for this thread, and initialise it.
    working_space = reinterpret_cast<char *>(working_space) +
                    this->get_working_space_per_thread(args) * thread_id;
    this->initialise_thread_working_space(args, working_space);

    // Get the activation values; default to an unbounded range.
    auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
    auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
    switch (args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        // Upper bound comes from the activation parameter; then fall
        // through to also apply the ReLU lower bound of zero.
        activation_max = static_cast<TOut>(args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        activation_min = static_cast<TOut>(0);
        break;
      default:
        break;
    }

    // Determine the number of tiles in a row, we use this to get the right
    // offset into the input data.
    const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();

    // Execute over all batches
    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      // Position this thread on its first row of tiles within the batch,
      // then advance the batch base pointers for the next iteration.
      auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
      auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
      inptr += ld_in_batch;
      outptr += ld_out_batch;

      // Stripe rows of tiles over threads.
      for (auto out_i = thread_id * this->get_output_rows();
           out_i < args.output_shape.rows;
           out_i += n_threads * this->get_output_rows())
      {
        // Capture this row's start, then step the row pointers forward by
        // n_threads rows of tiles ready for the next striped iteration.
        auto inptr_tile = inptr_row;
        auto outptr_tile = outptr_row;
        inptr_row += n_threads * n_tile_cols * ld_in_row;
        outptr_row += n_threads * this->get_output_rows() * ld_out_row;

        // Iterate over all columns
        for (auto out_j = 0u; out_j < args.output_shape.cols;
             out_j += this->get_output_cols())
        {
          // Execute the tile
          this->execute_tile(
            args.n_output_channels,
            inptr_tile, ld_in_matrix,
            bias,
            outptr_tile, ld_out_row, ld_out_col,
            activation_min, activation_max,
            args.output_shape.rows - out_i,  // Number of valid rows remaining
            args.output_shape.cols - out_j,  // Number of valid columns remaining
            working_space
          );

          // Progress the pointers
          inptr_tile += ld_in_row;
          outptr_tile += this->get_output_cols() * ld_out_col;
        }
      }
    }
  }

  public:
  /* Construct with the transform's name, output tile shape and kernel
   * shape. */
  TransformBase(const std::string &name,
                unsigned int output_rows, unsigned int output_cols,
                unsigned int kernel_rows, unsigned int kernel_cols)
  : m_name(name),
    m_output_rows(output_rows), m_output_cols(output_cols),
    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
  {
  }

  const std::string &get_name(void) const override { return m_name; }

  // The input tile size follows from the output tile and kernel sizes:
  // input = kernel + output - 1.
  unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
  unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }

  unsigned int get_output_rows(void) const override final { return m_output_rows; }
  unsigned int get_output_cols(void) const override final { return m_output_cols; }

  unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
  unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }

  /* Total working space: one per-thread slice for each of the n_threads
   * threads. */
  size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
  {
    return n_threads * this->get_working_space_per_thread(args);
  }

  /* Type-erased entry point: cast the untyped pointers to the template
   * types and delegate to `execute_internal`. */
  void execute(
    const ConvolutionArgs &args,
    const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
    const void *bias,
    void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
    void *working_space, unsigned int thread_id, unsigned int n_threads
  ) const override
  {
    execute_internal(
      args,
      reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
      reinterpret_cast<const TIn *>(bias),
      reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
      working_space, thread_id, n_threads
    );
  }
};
template <typename TIn, typename TOut=TIn>
class TransformUnpadded : public TransformBase<TIn, TOut>
{
  /* Signature of a kernel which computes one complete output tile; the
   * kernel itself has no notion of clipping and always writes a full tile.
   */
  using Kernel = std::function<void(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max
  )>;
  const Kernel m_kernel;

  protected:
  /* Each thread requires a scratch buffer large enough to hold one full
   * output tile; clipped tiles are computed there and then copied out. */
  size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
  {
    const auto n_output_points = this->get_output_rows() * this->get_output_cols();
    return sizeof(TOut) * n_output_points * args.n_output_channels;
  }

  /* Execute the kernel for one tile, handling clipping at the bottom/right
   * edges of the output tensor.
   *
   * `valid_rows` / `valid_cols` give the rows/columns remaining in the
   * tensor from this tile's origin and may exceed the tile size (in which
   * case the tile is written directly to the output tensor).
   */
  void execute_tile(
    unsigned int n_channels,
    const TIn *inptr, size_t ld_in_matrix,
    const TIn *bias,
    TOut *outptr, size_t ld_out_row, size_t ld_out_col,
    TOut activation_min, TOut activation_max,
    unsigned int valid_rows, unsigned int valid_cols,
    void *working_space
  ) const override final
  {
    // The tile is clipped if it runs off the bottom or right edge of the
    // output tensor. (Hoisted into a single predicate; the original
    // computed this condition twice, and its comment incorrectly described
    // the clipping as being on the left or right.)
    const bool is_clipped = valid_rows < this->get_output_rows() ||
                            valid_cols < this->get_output_cols();

    // Get copies of the output tensor parameters
    auto kernel_outptr = outptr;
    auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;

    // If the tile is clipped, execute the kernel into the working space (a
    // dense, channel-contiguous full-tile buffer) and copy the valid
    // portion into the destination afterwards.
    if (is_clipped)
    {
      // Override the kernel output parameters
      kernel_outptr = reinterpret_cast<TOut *>(working_space);
      kernel_ld_out_col = n_channels;
      kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
    }

    // Execute the kernel
    m_kernel(
      n_channels,
      inptr, ld_in_matrix,
      bias,
      kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
      activation_min, activation_max
    );

    // If necessary, copy from the working space into the destination tensor.
    if (is_clipped)
    {
      // Clamp to the tile size; the valid counts may exceed it.
      const auto last_row = std::min(valid_rows, this->get_output_rows());
      const auto last_col = std::min(valid_cols, this->get_output_cols());
      for (auto i = 0u; i < last_row; i++)
      {
        auto patch_tile = kernel_outptr;
        auto out_tile = outptr;
        kernel_outptr += kernel_ld_out_row;
        outptr += ld_out_row;
        for (auto j = 0u; j < last_col; j++)
        {
          // Copy one pixel's worth of channels.
          std::memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
          patch_tile += kernel_ld_out_col;
          out_tile += ld_out_col;
        }
      }
    }
  }

  public:
  /* Construct with the transform's name, output tile shape, kernel shape
   * and the kernel function used to compute each full tile. */
  TransformUnpadded(const std::string &name,
                    unsigned int output_rows, unsigned int output_cols,
                    unsigned int kernel_rows, unsigned int kernel_cols,
                    const Kernel kernel)
  : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
    m_kernel(kernel)
  {
  }

  /* Utility method to get a transposed variant of a kernel, this transposed
   * version simply calls the original kernel with the output row and column
   * strides swapped.
   */
  static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
  {
    return [kernel] (
      const unsigned int n_channels,
      const TIn *const inptr, const size_t ld_in_matrix,
      const TIn *const bias,
      TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
      const TOut activation_min, const TOut activation_max
    ) {
      kernel(n_channels, inptr, ld_in_matrix, bias,
             outptr, ld_out_col, ld_out_row,  // strides deliberately swapped
             activation_min, activation_max);
    };
  }
};
} // namespace output_transform
} // namespace winograd
} // namespace arm_conv