/*
* Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "depthwise.hpp"
#include "utils.hpp"
namespace arm_conv {
namespace depthwise {
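// Traits giving the default accumulator type and output stage for a given data
// type. For example, the quantised int8_t/uint8_t specialisations below
// accumulate into int32_t and use an arm_gemm::Requantize32 output stage,
// while float falls through to a float accumulator with no (Nothing) output
// stage.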
template <typename T> struct DefaultTAccum { using Type = T; };
template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
template <typename T> struct DefaultOutputStage { using Type = Nothing; };
template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
class IDepthfirstStrategy
{
public:
virtual ~IDepthfirstStrategy() = default;
virtual unsigned int get_input_rows() const = 0;
virtual unsigned int get_input_cols() const = 0;
virtual unsigned int get_output_rows() const = 0;
virtual unsigned int get_output_cols() const = 0;
};
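// A minimal (hypothetical) sketch of how a concrete strategy might report its
// tile geometry - assuming a 3x3 kernel at unit stride, where each 2x2 output
// tile consumes a 4x4 input patch:
//
//   struct HypotheticalStrategy3x3 final : public IDepthfirstStrategy
//   {
//     unsigned int get_input_rows() const override { return 4; }
//     unsigned int get_input_cols() const override { return 4; }
//     unsigned int get_output_rows() const override { return 2; }
//     unsigned int get_output_cols() const override { return 2; }
//   };
//
// The driver below relies only on these four quantities when tiling the output.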
template <typename T>
struct TensorSpec
{
T base;
size_t ld_row, ld_col;
TensorSpec(T ptr, size_t ld_row, size_t ld_col)
: base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};
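// A TensorSpec is a (base pointer, row stride, column stride) view of a 2D
// tensor. As used below, the strides are measured in elements of T, so (as an
// illustrative sketch) the element at (row, col) would be addressed as:
//
//   auto *element = spec.base + row * spec.ld_row + col * spec.ld_col;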
template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
protected:
using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
// The strategy which we're applying to solve the depthwise convolution.
std::unique_ptr<const IDepthfirstStrategy> m_strat;
/* Compute the amount of working space required for a single thread. */
virtual size_t get_working_size_per_thread() const = 0;
/* Initialise the working space for a thread. */
virtual void initialise_working_space(void *) const = 0;
/* Compute a portion of the output tensor with padding. */
virtual void compute_tile_padded(
const DepthwiseArgs &args,
unsigned int output_i, unsigned int output_j,
unsigned int output_channel_start, unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
const TensorSpec<TOutput *> &output,
const void *parameters,
void *working_space
) const = 0;
/* Compute a portion of the work with only top/bottom padding.
*
* The default implementation of this repeatedly calls into the padded tile
* variant.
*/
virtual void compute_row_padded_tile_row(
const DepthwiseArgs &args,
const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
const unsigned int output_channel_start, const unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
const TensorSpec<TOutput *> &output,
const void *parameters,
void *working_space
) const
{
for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
{
this->compute_tile_padded(
args,
output_i, output_j, output_channel_start, output_channel_end,
input, output, parameters, working_space
);
}
}
/* Compute a portion of the output tensor with no padding.
*
* The default implementation of this repeatedly calls into the padded
* variant.
*/
virtual void compute_tiles_unpadded(
const DepthwiseArgs &args,
unsigned int start_output_i, unsigned int start_output_j,
unsigned int n_tile_rows, unsigned int n_tile_cols,
unsigned int output_channel_start, unsigned int output_channel_end,
const TensorSpec<const TInput *> &input,
const TensorSpec<TOutput *> &output,
const void *parameters,
void *working_space
) const
{
for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
{
unsigned int row_start_output_j = start_output_j;
for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
{
this->compute_tile_padded(
args,
start_output_i, row_start_output_j,
output_channel_start, output_channel_end,
input, output, parameters, working_space
);
row_start_output_j += m_strat->get_output_cols();
}
start_output_i += m_strat->get_output_rows();
}
}
void execute_internal(
const DepthwiseArgs &args,
const void *input,
size_t ld_input_col,
size_t ld_input_row,
size_t ld_input_batch,
const void *parameters,
void *output,
size_t ld_output_col,
size_t ld_output_row,
size_t ld_output_batch,
void *working_space,
unsigned int thread_id,
unsigned int n_threads
) const override
{
// Get and initialise the working space for this thread.
void *thread_working_space =
static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
this->initialise_working_space(thread_working_space);
// Construct convenient representations of the input/output tensors.
TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
const auto n_output_channels = args.input_channels * args.channel_multiplier;
// By default we parallelize over the rows, but if there's only one row we
// parallelize over the batches instead.
auto thread_id_for_rows = thread_id;
auto n_threads_for_rows = n_threads;
auto thread_id_for_batches = 0;
auto n_threads_for_batches = 1;
if (args.output_rows == 1) {
thread_id_for_rows = 0;
n_threads_for_rows = 1;
thread_id_for_batches = thread_id;
n_threads_for_batches = n_threads;
}
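// For example (illustrative numbers only): with four threads and a strategy
// producing two output rows per tile, thread 1 handles the tile-rows starting
// at output rows 2, 10, 18, ...; if the output has only one row, thread 1
// instead handles batches 1, 5, 9, ...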
// Progress the pointers for the first batch.
input_tensor.base += ld_input_batch*thread_id_for_batches;
output_tensor.base += ld_output_batch*thread_id_for_batches;
for (unsigned int batch = thread_id_for_batches;
batch < args.n_batches;
batch += n_threads_for_batches)
{
// Iterate over rows of the output tensor; we stripe over the tiles.
for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
start_output_i < args.output_rows;
start_output_i += n_threads_for_rows * m_strat->get_output_rows())
{
// Determine what padding (if any) is required on the top/bottom of
// this row of the convolution.
const auto end_output_i = start_output_i + m_strat->get_output_rows();
const bool pad_output_bottom = args.output_rows < end_output_i;
const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
const bool pad_input_top = start_input_i < 0;
const int end_input_i = start_input_i + m_strat->get_input_rows();
const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
// We only need to account for input padding if direct padding is not supported.
const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
|| pad_output_bottom;
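// For example, with args.padding.top == 1 and unit stride, the first
// tile-row sees start_input_i == -1, so pad_input_top is true and (unless
// the strategy supports direct padding) the whole row takes the padded path.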
// Iterate over the columns of the output tensor; we attempt to grab as
// much of the unpadded region as possible in one go, so the loop
// structure is a bit odd.
unsigned int start_output_j = 0;
while (start_output_j < args.output_cols)
{
const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
const bool pad_input_left = start_in_j < 0;
// Determine if we can process a number of unpadded tiles in one go.
int n_unpadded_tiles = 0;
if ((!pad_input_left) || this->supports_direct_padding())
{
// Determine the maximum number of tiles we could handle.
n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
// Handle padding on the right-hand edge
const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
while (n_unpadded_tiles > 0 &&
(static_cast<int>(args.output_cols) < end_output_j ||
static_cast<int>(args.input_cols) < end_input_j))
{
n_unpadded_tiles--;
end_output_j -= m_strat->get_output_cols();
end_input_j -= tile_stride;
}
}
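// As an illustrative (hypothetical) example: with tiles producing 4 output
// columns from 6 input columns, input_cols == 13, unit stride and one column
// of right padding (so output_cols == 12), starting from column 0 gives
// n_unpadded_tiles == 3; but the third tile would read past the 13-column
// input (end_input_j == 14), so it is trimmed to 2 unpadded tiles. The
// remaining tile (output columns 8-11), which needs the right padding, falls
// through to compute_tile_padded on the next pass of the loop.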
// Process unpadded tiles, if possible, otherwise process a padded tile.
if (n_unpadded_tiles)
{
if (!pad_row)
{
// Completely unpadded execution
this->compute_tiles_unpadded(
args,
start_output_i, start_output_j,
1, n_unpadded_tiles, // Compute a row of unpadded tiles
0, n_output_channels, // Compute all channels
input_tensor, output_tensor, parameters, thread_working_space
);
}
else
{
// Top/bottom padding only
this->compute_row_padded_tile_row(
args,
start_output_i, start_output_j, n_unpadded_tiles,
0, n_output_channels, // Compute all channels
input_tensor, output_tensor, parameters, thread_working_space
);
}
start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
}
else
{
this->compute_tile_padded(
args,
start_output_i, start_output_j,
0, n_output_channels, // Compute all channels
input_tensor, output_tensor, parameters, thread_working_space
);
start_output_j += m_strat->get_output_cols();
}
}
}
// Progress the pointers for the next batch.
input_tensor.base += ld_input_batch*n_threads_for_batches;
output_tensor.base += ld_output_batch*n_threads_for_batches;
}
}
public:
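// Takes ownership of the provided strategy (stored in a unique_ptr).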
DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
: Parent(args), m_strat(strategy)
{
}
size_t get_working_size(unsigned int n_threads) const override final
{
return n_threads * this->get_working_size_per_thread();
}
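// Whether the strategy can consume padded input regions directly. When this
// returns true, execute_internal keeps using the unpadded tile paths even
// where the input would require top/bottom or left padding, rather than
// falling back to compute_tile_padded.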
virtual bool supports_direct_padding() const
{
return false;
}
};
} // namespace depthwise
} // namespace arm_conv