/*
* Copyright (c) 2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*
* NOTE: Header to be included by implementation files only.
*
* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
*/
#pragma once
#include <algorithm>
#include <cstdint>
#include <limits>
#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"
// Comment out the following definition to use floating-point requantisation;
// leave it defined to use fixed-point requantisation.
#define FIXED_POINT_REQUANTISATION 1
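// The fixed-point path multiplies the accumulator by a Q0.31 multiplier via a
// saturating doubling high multiply and then applies a rounding right shift;
// the floating-point path scales the accumulator by a float rescale factor.
// Both paths then add the output zero-point offset.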
using namespace neon_convolution_kernels;
using namespace qasymm8;
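// Clamp a value of arbitrary arithmetic type U to the representable range of
// the (usually narrower) type T; `clamp_and_cast` additionally converts the
// clamped value to T.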
template <typename T>
struct clamp_to_limits
{
template <typename U>
static inline U clamp(const U& v)
{
const U min = static_cast<U>(std::numeric_limits<T>::min());
const U max = static_cast<U>(std::numeric_limits<T>::max());
return std::min(std::max(v, min), max);
}
template <typename U>
static inline T clamp_and_cast(const U& v)
{
return static_cast<T>(clamp(v));
}
};
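// Saturating doubling high multiply: returns the high 32 bits of the 64-bit
// product 2*a*b, rounded and saturated (vqrdmulh). This is the fixed-point
// multiplication underlying the requantisation below.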
template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);
template <>
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
return vqrdmulhq_n_s32(a, b);
}
template <>
inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}
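// Rounding division by a power of two: computes x / 2^exponent, rounding to
// nearest. The fixup term corrects the rounding for negative inputs, matching
// gemmlowp's RoundingDivideByPOT.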
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);
template <>
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
const int32x4_t shift = vdupq_n_s32(-exponent);
const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
const int32x4_t fixed = vqaddq_s32(x, fixup);
return vrshlq_s32(fixed, shift);
}
template <>
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
const int32x2_t shift = vdup_n_s32(-exponent);
const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
const int32x2_t fixed = vqadd_s32(x, fixup);
return vrshl_s32(fixed, shift);
}
template <>
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
const int32x2_t xs = vdup_n_s32(x);
return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
namespace depthwise
{
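// The following constructors delegate to the variants which accept explicit
// rescale parameters, deriving those parameters from the given quantisation
// parameters.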
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
int n_batches, int n_input_rows, int n_input_cols, int n_channels,
const ActivationFunction activation,
const QAsymm8Params& weight_quantisation,
const QAsymm8Params& input_quantisation,
const QAsymm8Params& output_quantisation,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
n_batches, n_input_rows, n_input_cols, n_channels,
activation, weight_quantisation, input_quantisation, output_quantisation,
QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
padding_top, padding_left, padding_bottom, padding_right
)
{
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
int n_batches, int n_input_rows, int n_input_cols, int n_channels,
int n_output_rows, int n_output_cols,
const ActivationFunction activation,
const QAsymm8Params& weight_quantisation,
const QAsymm8Params& input_quantisation,
const QAsymm8Params& output_quantisation,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
n_batches, n_input_rows, n_input_cols, n_channels,
n_output_rows, n_output_cols,
activation, weight_quantisation, input_quantisation, output_quantisation,
QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
padding_top, padding_left, padding_bottom, padding_right
)
{
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
int n_batches, int n_input_rows, int n_input_cols, int n_channels,
const ActivationFunction activation,
const QAsymm8Params& weight_quantisation,
const QAsymm8Params& input_quantisation,
const QAsymm8Params& output_quantisation,
const QAsymm8RescaleParams& rescale_params,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
unsigned int padding_right
) : Base(
n_batches, n_input_rows, n_input_cols, n_channels,
get_activation_fn(activation, output_quantisation),
padding_top, padding_left, padding_bottom, padding_right
),
_weights_quant(weight_quantisation),
_inputs_quant(input_quantisation),
_output_quant(output_quantisation),
rescale_parameters(rescale_params)
{
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
int n_batches, int n_input_rows, int n_input_cols, int n_channels,
int n_output_rows, int n_output_cols,
const ActivationFunction activation,
const QAsymm8Params& weight_quantisation,
const QAsymm8Params& input_quantisation,
const QAsymm8Params& output_quantisation,
const QAsymm8RescaleParams& rescale_params,
unsigned int padding_top,
unsigned int padding_left,
unsigned int padding_bottom,
unsigned int padding_right
) : Base(
n_batches, n_input_rows, n_input_cols, n_channels,
n_output_rows, n_output_cols,
get_activation_fn(activation, output_quantisation),
padding_top, padding_left, padding_bottom, padding_right
),
_weights_quant(weight_quantisation),
_inputs_quant(input_quantisation),
_output_quant(output_quantisation),
rescale_parameters(rescale_params)
{
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
ActivationFunction QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::get_activation_fn(
const ActivationFunction activation,
const QAsymm8Params& output_quant
)
{
if (
(activation == ActivationFunction::ReLU &&
output_quant.quantize(0) == 0) ||
(activation == ActivationFunction::ReLU6 &&
output_quant.quantize(0) == 0 &&
output_quant.dequantize(255) <= 6.0f)
)
{
// If the range of values which can be represented by a quantised value is
// within the range that would be produced by the activation function, then
// the activation function is redundant and can be skipped.
return ActivationFunction::None;
}
else if(
activation == ActivationFunction::ReLU6 &&
output_quant.dequantize(255) <= 6.0f
)
{
// If the largest value that can be represented by a quantised value is no
// greater than the upper bound imposed by the activation function (6.0 for
// ReLU6), then the activation function can be relaxed to a ReLU.
return ActivationFunction::ReLU;
}
return activation;
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
return _inputs_quant.offset;
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
void * const buffer,
const void * const weights,
const unsigned int weight_row_stride,
const unsigned int weight_col_stride,
const void * const biases
) const
{
const uint8_t *wptr = static_cast<const uint8_t *>(weights);
const int32_t *bptr = static_cast<const int32_t *>(biases);
uint8_t *outptr = static_cast<uint8_t *>(buffer);
// We set the vector length to use quad registers on AArch64 and only double
// registers on AArch32. NOTE For SVE set this to the actual vector length.
#if defined(__aarch64__)
unsigned int veclen = 16;
#elif defined(__arm__)
unsigned int veclen = 8;
#else
// Scalar fallback so that `veclen` is always defined on other architectures.
unsigned int veclen = 1;
#endif
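// A quantised multiply-accumulate over the K = KernelRows * KernelCols taps
// of a channel expands as:
//   sum_k (w_k - w_off) * (x_k - x_off)
//     =   sum_k w_k * x_k        (computed in `execute_tile`)
//       - x_off * sum_k w_k      (rank 1 term, folded into the bias below)
//       - w_off * sum_k x_k      (subtracted per output pixel in `execute_tile`)
//       + K * w_off * x_off      (rank 0 term, folded into the bias below)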
// Compute the rank 0 offset arising from the quantisation parameters.
const int32_t rank0_offset = (KernelRows * KernelCols *
static_cast<int32_t>(_weights_quant.offset) *
static_cast<int32_t>(_inputs_quant.offset));
// While there are channels left to process, pack a vector length of them at
// a time, reducing the vector length as the number of remaining channels
// decreases.
for (
unsigned int n_channels = this->n_channels(); n_channels;
n_channels -= veclen,
outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
)
{
// NOTE Ignore this section if using SVE: the vector length remains the same
// and the tail simply leaves part of the register unfilled.
while (n_channels < veclen)
{
// Reduce the vector length to either 8 or 1 (scalar)
// TODO Support more vector lengths in `execute_tile`.
veclen = (veclen == 16) ? 8 : 1;
}
// Get pointers to bias and weight portions of the output structure.
int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
// Copy a vector length of elements
for (unsigned int n = 0; n < veclen && n < n_channels; n++)
{
int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
uint32_t weight_sum = 0;
for (unsigned int i = 0; i < KernelRows; i++)
{
uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
for (unsigned int j = 0; j < KernelCols; j++)
{
uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
row_outptr[j*veclen + n] = w;
weight_sum += static_cast<uint32_t>(w);
}
}
wptr++;
// Include in the bias contributions from the quantisation offset
int32_t rank1_offset = static_cast<int32_t>(
static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
);
out_bptr[n] = bias + rank0_offset - rank1_offset;
}
}
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
int n_channels,
const void* packed_params,
const uint8_t* inptr,
const unsigned int in_row_stride,
const unsigned int in_col_stride,
uint8_t* outptr,
const unsigned int out_row_stride,
const unsigned int out_col_stride
)
{
// Activation parameters (unused if Activation is None)
const uint8_t aqmin = _output_quant.offset;
const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
// Byte type pointer to weights and biases
const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
#if defined(__aarch64__) // Under AArch64 only use quad registers
for (; n_channels >= 16; n_channels -= 16)
{
// Load biases
const int32x4_t biases[4] = {
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
};
wbptr += 16*sizeof(int32_t);
// Load weights
uint8x16_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = vld1q_u8(wbptr);
wbptr += 16;
}
}
// Load the input activations
uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
}
}
inptr += 16;
// Perform the convolution
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
// Two sets of operations are required: we perform the multiply-accumulates
// for the convolution proper, but must also sum the input elements to
// account for the _weight_ offset.
uint32x4_t accs[4];
for (unsigned int i = 0; i < 4; i++)
{
accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
}
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
// Get relevant weight and activation pixel
const uint8x16_t w = weights[wi][wj];
const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
// Perform multiplication and accumulation
const uint16x8_t muls[2] = {
vmull_u8(vget_low_u8(w), vget_low_u8(x)),
vmull_u8(vget_high_u8(w), vget_high_u8(x))
};
const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
const uint16x8_t sum_elems[2] = {
vmull_u8(vget_low_u8(x), woffset),
vmull_u8(vget_high_u8(x), woffset)
};
const uint32x4_t tmps[4] = {
vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
};
for (unsigned int i = 0; i < 4; i++)
{
accs[i] = vaddq_u32(accs[i], tmps[i]);
}
}
}
// Rescale the accumulator and add in the new offset.
uint32x4_t final_accs[4];
for (unsigned int i = 0; i < 4; i++)
{
#ifdef FIXED_POINT_REQUANTISATION
const int32x4_t y = rounding_divide_by_exp2(
saturating_doubling_high_mul(
reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
),
rescale_parameters.shift
);
const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
}
uint8x16_t output = vcombine_u8(
vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
);
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = vmaxq_u8(output, vdupq_n_u8(aqmin));
}
if (Activation == ActivationFunction::ReLU6)
{
output = vminq_u8(output, vdupq_n_u8(aqmax));
}
vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
}
}
outptr += 16;
}
#endif // defined(__aarch64__)
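// Process eight channels at a time using double (64-bit) registers.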
for (; n_channels >= 8; n_channels -= 8)
{
const int32x4_t biases[2] = {
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
};
wbptr += 8*sizeof(int32_t);
uint8x8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = vld1_u8(wbptr);
wbptr += 8;
}
}
uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
}
}
inptr += 8;
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
uint32x4_t accs[2];
for (unsigned int i = 0; i < 2; i++)
{
accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
}
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
const uint8x8_t w = weights[wi][wj];
const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
const uint16x8_t muls = vmull_u8(w, x);
const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
const uint16x8_t sum_elems = vmull_u8(x, woffset);
const uint32x4_t tmps[2] = {
vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
};
for (unsigned int i = 0; i < 2; i++)
{
accs[i] = vaddq_u32(accs[i], tmps[i]);
}
}
}
uint32x4_t final_accs[2];
for (unsigned int i = 0; i < 2; i++)
{
#ifdef FIXED_POINT_REQUANTISATION
const int32x4_t y = rounding_divide_by_exp2(
saturating_doubling_high_mul(
reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
),
rescale_parameters.shift
);
const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
}
uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = vmax_u8(output, vdup_n_u8(aqmin));
}
if (Activation == ActivationFunction::ReLU6)
{
output = vmin_u8(output, vdup_n_u8(aqmax));
}
vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
}
}
outptr += 8;
}
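// Process any remaining channels one at a time using scalar arithmetic.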
for (; n_channels; n_channels--)
{
// Load bias
const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
wbptr += sizeof(int32_t);
// Load weights
uint8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = *(wbptr++);
}
}
// Load the input activations
uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
}
}
inptr++;
// Perform the convolution
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
int32_t acc = bias;
uint32_t element_sum = 0;
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
element_sum += static_cast<uint32_t>(x);
}
}
acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
// Requantise
#ifdef FIXED_POINT_REQUANTISATION
acc = rounding_divide_by_exp2(
saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
rescale_parameters.shift
);
acc += _output_quant.offset;
uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
float fp_acc = static_cast<float>(acc);
fp_acc *= rescale_parameters.rescale;
fp_acc += static_cast<float>(_output_quant.offset);
fp_acc = std::max<float>(fp_acc, 0.0f);
uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = std::max(output, aqmin);
}
if (Activation == ActivationFunction::ReLU6)
{
output = std::min(output, aqmax);
}
*(outptr + oi*out_row_stride + oj*out_col_stride) = output;
}
}
outptr++;
}
}
template <
unsigned int OutputTileRows, unsigned int OutputTileCols,
unsigned int KernelRows, unsigned int KernelCols,
unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
int n_channels,
const void* packed_params,
const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
// Activation parameters (unused if Activation is None)
const uint8_t aqmin = _output_quant.offset;
const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;
// Byte type pointer to weights and biases
const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
// Offset into input/output tensors
int n = 0;
#if defined(__aarch64__) // Under AArch64 only use quad registers
for (; n_channels >= 16; n_channels -= 16, n += 16)
{
// Load biases
const int32x4_t biases[4] = {
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
};
wbptr += 16*sizeof(int32_t);
// Load weights
uint8x16_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = vld1q_u8(wbptr);
wbptr += 16;
}
}
// Load the input activations
uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
}
}
// Perform the convolution
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
// Two sets of operations are required: we perform the multiply-accumulates
// for the convolution proper, but must also sum the input elements to
// account for the _weight_ offset.
uint32x4_t accs[4];
for (unsigned int i = 0; i < 4; i++)
{
accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
}
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
// Get relevant weight and activation pixel
const uint8x16_t w = weights[wi][wj];
const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
// Perform multiplication and accumulation
const uint16x8_t muls[2] = {
vmull_u8(vget_low_u8(w), vget_low_u8(x)),
vmull_u8(vget_high_u8(w), vget_high_u8(x))
};
const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
const uint16x8_t sum_elems[2] = {
vmull_u8(vget_low_u8(x), woffset),
vmull_u8(vget_high_u8(x), woffset)
};
const uint32x4_t tmps[4] = {
vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
};
for (unsigned int i = 0; i < 4; i++)
{
accs[i] = vaddq_u32(accs[i], tmps[i]);
}
}
}
// Rescale the accumulator and add in the new offset.
uint32x4_t final_accs[4];
for (unsigned int i = 0; i < 4; i++)
{
#ifdef FIXED_POINT_REQUANTISATION
const int32x4_t y = rounding_divide_by_exp2(
saturating_doubling_high_mul(
reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
),
rescale_parameters.shift
);
const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
}
uint8x16_t output = vcombine_u8(
vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
);
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = vmaxq_u8(output, vdupq_n_u8(aqmin));
}
if (Activation == ActivationFunction::ReLU6)
{
output = vminq_u8(output, vdupq_n_u8(aqmax));
}
vst1q_u8(outptrs[oi][oj] + n, output);
}
}
}
#endif // defined(__aarch64__)
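// Process eight channels at a time using double (64-bit) registers.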
for (; n_channels >= 8; n_channels -= 8, n += 8)
{
const int32x4_t biases[2] = {
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
};
wbptr += 8*sizeof(int32_t);
uint8x8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = vld1_u8(wbptr);
wbptr += 8;
}
}
uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = vld1_u8(inptrs[i][j] + n);
}
}
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
uint32x4_t accs[2];
for (unsigned int i = 0; i < 2; i++)
{
accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
}
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
const uint8x8_t w = weights[wi][wj];
const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
const uint16x8_t muls = vmull_u8(w, x);
const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
const uint16x8_t sum_elems = vmull_u8(x, woffset);
const uint32x4_t tmps[2] = {
vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
};
for (unsigned int i = 0; i < 2; i++)
{
accs[i] = vaddq_u32(accs[i], tmps[i]);
}
}
}
uint32x4_t final_accs[2];
for (unsigned int i = 0; i < 2; i++)
{
#ifdef FIXED_POINT_REQUANTISATION
const int32x4_t y = rounding_divide_by_exp2(
saturating_doubling_high_mul(
reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
),
rescale_parameters.shift
);
const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
}
uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = vmax_u8(output, vdup_n_u8(aqmin));
}
if (Activation == ActivationFunction::ReLU6)
{
output = vmin_u8(output, vdup_n_u8(aqmax));
}
vst1_u8(outptrs[oi][oj] + n, output);
}
}
}
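// Process any remaining channels one at a time using scalar arithmetic.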
for (; n_channels; n_channels--, n++)
{
// Load bias
const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
wbptr += sizeof(int32_t);
// Load weights
uint8_t weights[KernelRows][KernelCols];
for (unsigned int i = 0; i < KernelRows; i++)
{
for (unsigned int j = 0; j < KernelCols; j++)
{
weights[i][j] = *(wbptr++);
}
}
// Load the input activations
uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
{
for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
{
inputs[i][j] = *(inptrs[i][j] + n);
}
}
// Perform the convolution
for (unsigned int oi = 0; oi < OutputTileRows; oi++)
{
for (unsigned int oj = 0; oj < OutputTileCols; oj++)
{
int32_t acc = bias;
uint32_t element_sum = 0;
for (unsigned int wi = 0; wi < KernelRows; wi++)
{
for (unsigned int wj = 0; wj < KernelCols; wj++)
{
const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
element_sum += static_cast<uint32_t>(x);
}
}
acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);
// Requantise
#ifdef FIXED_POINT_REQUANTISATION
acc = rounding_divide_by_exp2(
saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
rescale_parameters.shift
);
acc += _output_quant.offset;
uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
float fp_acc = static_cast<float>(acc);
fp_acc *= rescale_parameters.rescale;
fp_acc += static_cast<float>(_output_quant.offset);
fp_acc = std::max<float>(fp_acc, 0.0f);
uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif
// Apply the activation function
if (Activation == ActivationFunction::ReLU ||
Activation == ActivationFunction::ReLU6)
{
output = std::max(output, aqmin);
}
if (Activation == ActivationFunction::ReLU6)
{
output = std::min(output, aqmax);
}
*(outptrs[oi][oj] + n) = output;
}
}
}
}
} // namespace depthwise