/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"

#pragma once

// Comment out the following definition to use floating-point requantisation;
// leave it defined to use fixed-point requantisation.
#define FIXED_POINT_REQUANTISATION 1
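// With fixed-point requantisation the accumulator is rescaled by an integer
// multiplier followed by a rounding right shift (see saturating_doubling_high_mul
// and rounding_divide_by_exp2 below); with floating-point requantisation it is
// multiplied by rescale_parameters.rescale instead.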

using namespace neon_convolution_kernels;
using namespace qasymm8;

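// Clamp a value of a wider arithmetic type to the representable range of T
// before narrowing; used to write requantised accumulators back as uint8_t.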
template <typename T>
struct clamp_to_limits
{
  template <typename U>
  static inline U clamp(const U& v)
  {
    const std::numeric_limits<T> limits;
    const U min = static_cast<U>(limits.min());
    const U max = static_cast<U>(limits.max());
    return std::min(std::max(v, min), max);
  }

  template <typename U>
  static inline T clamp_and_cast(const U& v)
  {
    return static_cast<T>(clamp(v));
  }
};

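// Saturating rounding doubling multiply returning the high half (the behaviour
// of the VQRDMULH instruction), i.e. approximately round(a * b / 2^31) with
// saturation.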
template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);

template <>
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
  return vqrdmulhq_n_s32(a, b);
}

template <>
inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

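// Divide by a power of two (2^exponent) with rounding to nearest. The fixup
// term biases negative inputs before the rounding shift so the result is a
// correctly rounded division rather than a plain arithmetic shift.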
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);

template <>
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t shift = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

template <>
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t shift = vdup_n_s32(-exponent);
  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
  const int32x2_t fixed = vqadd_s32(x, fixup);
  return vrshl_s32(fixed, shift);
}

template <>
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t xs = vdup_n_s32(x);
  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
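
// Together these helpers implement the fixed-point requantisation step
//   y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift)
// which approximates acc * (multiplier / 2^31) * 2^-shift.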

namespace depthwise
{
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

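// Select the cheapest activation that still produces correct results for the
// given output quantisation. Illustrative example: with offset 0 and scale
// 6/255 every representable output already lies in [0, 6], so a requested
// ReLU6 collapses to ActivationFunction::None.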
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
ActivationFunction QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::get_activation_fn(
  const ActivationFunction activation,
  const QAsymm8Params& output_quant
)
{
  if (
    (activation == ActivationFunction::ReLU &&
     output_quant.quantize(0) == 0) ||
    (activation == ActivationFunction::ReLU6 &&
     output_quant.quantize(0) == 0 &&
     output_quant.dequantize(255) <= 6.0f)
  )
  {
    // If the range of values which can be represented by a quantized value is
    // within the range that would be produced by the activation function, then
    // the activation function is redundant and can be skipped.
    return ActivationFunction::None;
  }
  else if (
    activation == ActivationFunction::ReLU6 &&
    output_quant.dequantize(255) <= 6.0f
  )
  {
    // If the largest value that can be represented by a quantized value is
    // lower than the upper boundary, then the activation function can be
    // relaxed to a ReLU.
    return ActivationFunction::ReLU;
  }

  return activation;
}

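// Padding elements are filled with the input quantisation offset, i.e. the
// quantised representation of 0.0f, so that padded positions contribute
// nothing to the convolution.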
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  return _inputs_quant.offset;
}

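// Pack the weights and biases for a vector length's worth of channels at a time.
// With asymmetric quantisation the accumulator expands as
//   sum_k (w_k - w_off) * (x_k - x_off)
//     = sum_k w_k*x_k - x_off*sum_k w_k - w_off*sum_k x_k + K*w_off*x_off,
// so the packed bias absorbs the terms that depend only on the weights (the
// rank-0 term K*w_off*x_off and the rank-1 term x_off*sum_k w_k); the remaining
// w_off*sum_k x_k term is subtracted in execute_tile once the inputs are known.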
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  uint8_t *outptr = static_cast<uint8_t *>(buffer);

  // We set the vector length to use quad (Q) registers on AArch64 and double
  // (D) registers on AArch32. NOTE: For SVE set this to the actual vector length.
#if defined(__aarch64__)
  unsigned int veclen = 16;
#else
#if defined(__arm__)
  unsigned int veclen = 8;
#endif
#endif

  // Compute the rank 0 offset arising from the quantisation parameters.
  const int32_t rank0_offset = (KernelRows * KernelCols *
                                static_cast<int32_t>(_weights_quant.offset) *
                                static_cast<int32_t>(_inputs_quant.offset));

  // While there are channels left to process, pack a vector length of them at
  // a time and reduce the size of vector used as the size of the tensor
  // decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE, the vector length remains the
    // same and we just don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      uint32_t weight_sum = 0;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
          weight_sum += static_cast<uint32_t>(w);
        }
      }
      wptr++;

      // Include in the bias the contributions from the quantisation offsets.
      int32_t rank1_offset = static_cast<int32_t>(
        static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
      );
      out_bptr[n] = bias + rank0_offset - rank1_offset;
    }
  }
}

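// Execute a single output tile. Channels are processed 16 at a time on AArch64
// (quad registers), then 8 at a time (double registers), with a scalar loop for
// any remaining tail; each path accumulates the products, subtracts the
// weight-offset correction, requantises and applies the activation.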
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  uint8_t* outptr,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

#if defined(__aarch64__) // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 16;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 16;
  }
#endif // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 8;

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 8;
  }
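
  // Scalar tail: process any remaining channels one at a time using plain
  // integer arithmetic.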
  for (; n_channels; n_channels--)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr++;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantise
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
      }
    }
    outptr++;
  }
}

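// Overload of execute_tile taking an array of pointers, one per element of the
// input and output tiles, rather than base pointers and strides; the
// per-channel computation is identical to the overload above.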
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

  // Offset into input/output tensors
  int n = 0;

#if defined(__aarch64__) // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16, n += 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
#endif // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8, n += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptrs[i][j] + n);
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
  for (; n_channels; n_channels--, n++)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantise
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptrs[oi][oj] + n) = output;
      }
    }
  }
}

} // namespace depthwise