/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"

#pragma once

// Comment out the following definition to use floating-point requantisation;
// leave it defined to use fixed-point requantisation.
#define FIXED_POINT_REQUANTISATION 1

using namespace neon_convolution_kernels;
using namespace qasymm8;

// Clamp a value of type U to the representable range of type T.
template <typename T>
struct clamp_to_limits
{
  template <typename U>
  static inline U clamp(const U& v)
  {
    const std::numeric_limits<T> limits;
    const U min = static_cast<U>(limits.min());
    const U max = static_cast<U>(limits.max());
    return std::min(std::max(v, min), max);
  }

  template <typename U>
  static inline T clamp_and_cast(const U& v)
  {
    return static_cast<T>(clamp(v));
  }
};
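
// For example, clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(300) yields
// 255, and clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(-7) yields 0.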

// Compute the rounding, doubling, saturating high multiply of `a` and `b`
// (VQRDMULH): the high 32 bits of 2*a*b, rounded, saturating when both
// inputs are INT32_MIN.
template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);

template <>
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
  return vqrdmulhq_n_s32(a, b);
}

template <>
inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

// Divide by 2^exponent, rounding to nearest with ties rounded away from
// zero. The fixup term adjusts negative inputs so that the rounding shift
// (which rounds halves upward) matches this convention.
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);

template <>
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t shift = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

template <>
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t shift = vdup_n_s32(-exponent);
  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
  const int32x2_t fixed = vqadd_s32(x, fixup);
  return vrshl_s32(fixed, shift);
}

template <>
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t xs = vdup_n_s32(x);
  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
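
// Fixed-point requantisation combines the two helpers above:
//
//   rounding_divide_by_exp2(
//     saturating_doubling_high_mul(acc, multiplier), shift)
//       ~= acc * (multiplier / 2^31) * 2^-shift
//
// Assuming QAsymm8RescaleParams::make_rescale_params follows the usual
// gemmlowp convention, a real rescale factor r in (0, 1) is encoded as
// r = m * 2^-shift with m in [0.5, 1) and multiplier = round(m * 2^31).
// For example, r = 0.2 gives shift = 2 and multiplier = 1717986918.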

namespace depthwise
{
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

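// Illustrative instantiation (hypothetical values; the tile, kernel and
// stride sizes are template parameters, while the quantisation parameters
// are supplied at runtime):
//
//   using Conv3x3 = QAsymm8DepthwiseConvolution<
//     2, 2,  // 2x2 output tile
//     3, 3,  // 3x3 kernel
//     1, 1   // unit stride
//   >;
//   Conv3x3 conv(n_batches, n_input_rows, n_input_cols, n_channels,
//                ActivationFunction::ReLU6, wquant, iquant, oquant,
//                pad_top, pad_left, pad_bottom, pad_right);
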
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
ActivationFunction QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::get_activation_fn(
  const ActivationFunction activation,
  const QAsymm8Params& output_quant
)
{
  if (
    (activation == ActivationFunction::ReLU &&
     output_quant.quantize(0) == 0) ||
    (activation == ActivationFunction::ReLU6 &&
     output_quant.quantize(0) == 0 &&
     output_quant.dequantize(255) <= 6.0f)
  )
  {
    // If the range of values representable by the quantized outputs is
    // contained within the range that the activation function would produce,
    // then the activation function is redundant and can be skipped.
    return ActivationFunction::None;
  }
  else if (
    activation == ActivationFunction::ReLU6 &&
    output_quant.dequantize(255) <= 6.0f
  )
  {
    // If the largest value that can be represented by a quantized value is
    // no greater than the upper boundary, then the activation function can
    // be relaxed to a ReLU.
    return ActivationFunction::ReLU;
  }

  return activation;
}
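
// For example, with an output offset of 0 and a scale of 6/255,
// quantize(0) == 0 and dequantize(255) == 6.0f, so a requested ReLU6
// collapses to None: every representable output already lies in [0, 6].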

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  // Use the input zero point as the padding value so that padded elements
  // dequantise to zero.
  return _inputs_quant.offset;
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  uint8_t *outptr = static_cast<uint8_t *>(buffer);

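  // For each group of `veclen` channels, the packed buffer holds `veclen`
  // int32 bias values followed by the weights for those channels, stored
  // kernel-position-major and channel-minor:
  //
  //   [ bias[0..veclen) | w(0,0)[0..veclen) | ... | w(R-1,C-1)[0..veclen) ]
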
  // We set the vector length to use quad (128-bit) registers on AArch64 and
  // only double (64-bit) registers on AArch32. NOTE For SVE set this to the
  // actual vector length.
#if defined(__aarch64__)
  unsigned int veclen = 16;
#elif defined(__arm__)
  unsigned int veclen = 8;
#endif

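  // A quantised multiply-accumulate expands as
  //
  //   sum_ij (w_ij - z_w)(x_ij - z_x)
  //     = sum_ij w_ij x_ij - z_w sum_ij x_ij - z_x sum_ij w_ij + K z_w z_x
  //
  // where K = KernelRows * KernelCols, and z_w, z_x are the weight and input
  // offsets. The two input-independent terms (the "rank 0" and "rank 1"
  // offsets below) are folded into the packed bias; the z_w * sum(x) term is
  // subtracted at run time in `execute_tile`.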
  // Compute the rank 0 offset arising from the quantisation parameters.
  const int32_t rank0_offset = (KernelRows * KernelCols *
                                static_cast<int32_t>(_weights_quant.offset) *
                                static_cast<int32_t>(_inputs_quant.offset));

  // While there are channels left to process, pack a vector length of them
  // at a time, reducing the vector length used as the number of remaining
  // channels decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE: the vector length remains the
    // same and we simply don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      uint32_t weight_sum = 0;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
          weight_sum += static_cast<uint32_t>(w);
        }
      }
      wptr++;

      // Include in the bias the contributions from the quantisation offsets
      int32_t rank1_offset = static_cast<int32_t>(
        static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
      );
      out_bptr[n] = bias + rank0_offset - rank1_offset;
    }
  }
}
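
// As a concrete example: a 3x3 kernel packed with veclen == 16 stores, per
// group of 16 channels, 16*sizeof(int32_t) == 64 bytes of bias followed by
// 9*16 == 144 bytes of weights, i.e. 208 bytes per group.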

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  uint8_t* outptr,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

#if defined(__aarch64__)  // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 16;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper, but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 16;
  }
#endif  // defined(__aarch64__)
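  // The eight-channel loop below acts as the main loop on AArch32 and as the
  // first level of tail handling on AArch64.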
  for (; n_channels >= 8; n_channels -= 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 8;

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 8;
  }
  for (; n_channels; n_channels--)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr++;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantize
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else  // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
      }
    }
    outptr++;
  }
}

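// The variant below performs the same computation as `execute_tile` above,
// but addresses the input and output through precomputed per-pixel pointer
// tables rather than dense row and column strides.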
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

  // Offset into input/output tensors
  int n = 0;

#if defined(__aarch64__)  // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16, n += 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper, but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
#endif  // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8, n += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptrs[i][j] + n);
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
  for (; n_channels; n_channels--, n++)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantize
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else  // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptrs[oi][oj] + n) = output;
      }
    }
  }
}

}  // namespace depthwise