blob: 5546d37e5981427dc1aa868baa73f9166638f146 [file] [log] [blame]
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001/*
2 * Copyright (c) 2019 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/*
26 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27 *
28 * NOTE: Header to be included by implementation files only.
29 *
30 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31 */
32
#include <algorithm>
#include <cstring>
#include <limits>

#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
38
39#pragma once
40
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000041using namespace neon_convolution_kernels;
42using namespace qasymm8;
43
/** Saturating, rounding, doubling multiply that keeps the high half of the product.
 *
 * Only the explicit specialisations that follow provide a definition.
 *
 * @param a Value (scalar or NEON vector) to be multiplied.
 * @param b Scalar multiplier applied to every element of @p a.
 *
 * @return Result with the same type as @p a.
 */
template <typename T>
inline T saturating_doubling_high_mul(const T& a, const int32_t& b);
46
47template <>
48inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
49{
50 return vqrdmulhq_n_s32(a, b);
51}
52
53template <>
54inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
55{
56 return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
57}
58
/** Divide by a power of two (2^exponent) with rounding.
 *
 * Only the explicit specialisations that follow provide a definition.
 *
 * @param x        Value (scalar or NEON vector) to be divided.
 * @param exponent Power of two to divide by; the specialisations negate it
 *                 and use it as a shift amount.
 *
 * @return Result with the same type as @p x.
 */
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);
61
62template <>
63inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
64{
65 const int32x4_t shift = vdupq_n_s32(-exponent);
66 const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
67 const int32x4_t fixed = vqaddq_s32(x, fixup);
68 return vrshlq_s32(fixed, shift);
69}
70
71template <>
72inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
73{
74 const int32x2_t shift = vdup_n_s32(-exponent);
75 const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
76 const int32x2_t fixed = vqadd_s32(x, fixup);
77 return vrshl_s32(fixed, shift);
78}
79
80template <>
81inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
82{
83 const int32x2_t xs = vdup_n_s32(x);
84 return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
85}
86
87namespace depthwise
88{
89template <
Georgios Pinitasa851bba2019-04-12 13:55:39 +010090 unsigned int OutputTileRows, unsigned int OutputTileCols,
91 unsigned int KernelRows, unsigned int KernelCols,
92 unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000093>
94QAsymm8DepthwiseConvolution<
Georgios Pinitasa851bba2019-04-12 13:55:39 +010095 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000096>::QAsymm8DepthwiseConvolution(
Georgios Pinitasa851bba2019-04-12 13:55:39 +010097 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
98 const ActivationFunction activation,
99 const QAsymm8Params& weight_quantisation,
100 const QAsymm8Params& input_quantisation,
101 const QAsymm8Params& output_quantisation,
102 unsigned int padding_top,
103 unsigned int padding_left,
104 unsigned int padding_bottom,
105 unsigned int padding_right
106 ) : QAsymm8DepthwiseConvolution(
107 n_batches, n_input_rows, n_input_cols, n_channels,
108 activation, weight_quantisation, input_quantisation, output_quantisation,
109 QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
110 padding_top, padding_left, padding_bottom, padding_right
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000111)
112{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000113}
114
115template <
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100116 unsigned int OutputTileRows, unsigned int OutputTileCols,
117 unsigned int KernelRows, unsigned int KernelCols,
118 unsigned int StrideRows, unsigned int StrideCols
119>
120QAsymm8DepthwiseConvolution<
121 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
122>::QAsymm8DepthwiseConvolution(
123 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
124 const ActivationFunction activation,
125 const QAsymm8Params& weight_quantisation,
126 const QAsymm8Params& input_quantisation,
127 const QAsymm8Params& output_quantisation,
128 const QAsymm8RescaleParams& rescale_params,
129 unsigned int padding_top,
130 unsigned int padding_left,
131 unsigned int padding_bottom,
132 unsigned int padding_right
133 ) : Base(
134 n_batches, n_input_rows, n_input_cols, n_channels, activation,
135 padding_top, padding_left, padding_bottom, padding_right
136),
137 _weights_quant(weight_quantisation),
138 _inputs_quant(input_quantisation),
139 _output_quant(output_quantisation),
140 rescale_parameters(rescale_params)
141{
142}
143
144template <
145 unsigned int OutputTileRows, unsigned int OutputTileCols,
146 unsigned int KernelRows, unsigned int KernelCols,
147 unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000148>
149uint8_t QAsymm8DepthwiseConvolution<
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100150 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000151>::_input_padding_value(void) const
152{
153 return _inputs_quant.offset;
154}
155
156template <
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100157 unsigned int OutputTileRows, unsigned int OutputTileCols,
158 unsigned int KernelRows, unsigned int KernelCols,
159 unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000160>
161void QAsymm8DepthwiseConvolution<
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100162 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000163>::_pack_params(
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100164 void * const buffer,
165 const void * const weights,
166 const unsigned int weight_row_stride,
167 const unsigned int weight_col_stride,
168 const void * const biases
169 ) const
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000170{
171 const uint8_t *wptr = static_cast<const uint8_t *>(weights);
172 const int32_t *bptr = static_cast<const int32_t *>(biases);
173 uint8_t *outptr = static_cast<uint8_t *>(buffer);
174
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100175 // We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE
176 // For SVE set this to half the vector length.
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000177 unsigned int veclen = 8;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000178
179 // While there are channels left to process, pack a vector length of them at
180 // a time and reduce the size of vector used as the size of the tensor
181 // decreases.
182 for (
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100183 unsigned int n_channels = this->n_channels(); n_channels;
184 n_channels -= veclen,
185 outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
186 )
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000187 {
188 // NOTE Ignore this section if using SVE, the vector length remains the
189 // same and we just don't fill a full register for the tail.
190 while (n_channels < veclen)
191 {
192 // Reduce the vector length to either 8 or 1 (scalar)
193 // TODO Support more vector lengths in `execute_tile`.
194 veclen = (veclen == 16) ? 8 : 1;
195 }
196
197 // Get pointers to bias and weight portions of the output structure.
198 int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
199 uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);
200
201 // Copy a vector length of elements
202 for (unsigned int n = 0; n < veclen && n < n_channels; n++)
203 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100204 const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
205 out_bptr[n] = bias;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000206
207 for (unsigned int i = 0; i < KernelRows; i++)
208 {
209 uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
210 for (unsigned int j = 0; j < KernelCols; j++)
211 {
212 uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
213 row_outptr[j*veclen + n] = w;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000214 }
215 }
216 wptr++;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000217 }
218 }
219}
220
221template <
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100222 unsigned int OutputTileRows, unsigned int OutputTileCols,
223 unsigned int KernelRows, unsigned int KernelCols,
224 unsigned int StrideRows, unsigned int StrideCols,
225 typename FInput, typename FOutput
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000226>
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100227static inline void tilefn(
228 int n_channels,
229 const void* packed_params,
230 FInput &get_input_ptr,
231 FOutput &get_output_ptr,
232 const int32_t clamp_max,
233 const int32_t clamp_min,
234 const uint8_t input_offset,
235 const uint8_t weight_offset,
236 const uint8_t output_offset,
237 const int32_t requant_multiplier,
238 const int32_t requant_shift
239 )
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000240{
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100241 constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
242 constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
243
244 // Offset into channels
245 int channel = 0;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000246
247 // Byte type pointer to weights and biases
248 const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);
249
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100250 for (; n_channels >= 8; n_channels -= 8, channel += 8)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000251 {
252 const int32x4_t biases[2] = {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100253 vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
254 vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000255 };
256 wbptr += 8*sizeof(int32_t);
257
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100258 int16x8_t weights[KernelRows][KernelCols];
259 const uint8x8_t woffset = vdup_n_u8(weight_offset);
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000260 for (unsigned int i = 0; i < KernelRows; i++)
261 {
262 for (unsigned int j = 0; j < KernelCols; j++)
263 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100264 const uint8x8_t w = vld1_u8(wbptr);
265 weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000266 wbptr += 8;
267 }
268 }
269
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100270 int16x8_t inputs[InnerTileRows][InnerTileCols];
271 const uint8x8_t ioffset = vdup_n_u8(input_offset);
272 for (unsigned int i = 0; i < InnerTileRows; i++)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000273 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100274 for (unsigned int j = 0; j < InnerTileCols; j++)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000275 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100276 const auto x = vld1_u8(get_input_ptr(i, j, channel));
277 inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000278 }
279 }
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000280
281 for (unsigned int oi = 0; oi < OutputTileRows; oi++)
282 {
283 for (unsigned int oj = 0; oj < OutputTileCols; oj++)
284 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100285 int32x4_t accs[2];
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000286 for (unsigned int i = 0; i < 2; i++)
287 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100288 accs[i] = biases[i];
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000289 }
290
291 for (unsigned int wi = 0; wi < KernelRows; wi++)
292 {
293 for (unsigned int wj = 0; wj < KernelCols; wj++)
294 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100295 const auto w = weights[wi][wj];
296 const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
297 accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
298 accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000299 }
300 }
301
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100302 int32x4_t final_accs[2];
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000303 for (unsigned int i = 0; i < 2; i++)
304 {
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000305 const int32x4_t y = rounding_divide_by_exp2(
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100306 saturating_doubling_high_mul(accs[i], requant_multiplier),
307 requant_shift);
308 const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
309 final_accs[i] = vaddq_s32(y, offset);
310 final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
311 final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000312 }
313
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100314 const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
315 vreinterpretq_s16_s32(final_accs[1]));
316 const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
317 const uint8x8_t output =
318 vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
319 vst1_u8(get_output_ptr(oi, oj, channel), output);
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000320 }
321 }
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000322 }
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100323 for (; n_channels; n_channels--, channel++)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000324 {
325 // Load bias
326 const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
327 wbptr += sizeof(int32_t);
328
329 // Load weights
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100330 int16_t weights[KernelRows][KernelCols];
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000331 for (unsigned int i = 0; i < KernelRows; i++)
332 {
333 for (unsigned int j = 0; j < KernelCols; j++)
334 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100335 weights[i][j] = *(wbptr++) - weight_offset;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000336 }
337 }
338
339 // Load the input activations
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100340 int16_t inputs[InnerTileRows][InnerTileCols];
341 for (unsigned int i = 0; i < InnerTileRows; i++)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000342 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100343 for (unsigned int j = 0; j < InnerTileCols; j++)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000344 {
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100345 inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100346 }
347 }
348
349 // Perform the convolution
350 for (unsigned int oi = 0; oi < OutputTileRows; oi++)
351 {
352 for (unsigned int oj = 0; oj < OutputTileCols; oj++)
353 {
354 int32_t acc = bias;
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100355
356 for (unsigned int wi = 0; wi < KernelRows; wi++)
357 {
358 for (unsigned int wj = 0; wj < KernelCols; wj++)
359 {
360 const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100361 acc += w * x;
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100362 }
363 }
364
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100365 // Requantize
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100366 acc = rounding_divide_by_exp2(
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100367 saturating_doubling_high_mul(acc, requant_multiplier),
368 requant_shift);
369 acc += output_offset;
370 acc = std::max(acc, clamp_min);
371 acc = std::min(acc, clamp_max);
372 uint8_t output = static_cast<uint8_t>(acc);
373 *(get_output_ptr(oi, oj, channel)) = output;
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100374 }
375 }
376 }
377}
378
Georgios Pinitasa851bba2019-04-12 13:55:39 +0100379template <
380 unsigned int OutputTileRows, unsigned int OutputTileCols,
381 unsigned int KernelRows, unsigned int KernelCols,
382 unsigned int StrideRows, unsigned int StrideCols,
383 typename FInput, typename FOutput
384>
385static inline void execute_tilefn(
386 int n_channels,
387 const void* packed_params,
388 const nck::ActivationFunction actfn,
389 FInput &get_input_ptr,
390 FOutput &get_output_ptr,
391 const QAsymm8Params &input_quant,
392 const QAsymm8Params &weight_quant,
393 const QAsymm8Params &output_quant,
394 const QAsymm8RescaleParams &requant
395 ) {
396 // Compute min/max clamp values
397 int32_t clamp_min = std::numeric_limits<uint8_t>::min();
398 int32_t clamp_max = std::numeric_limits<uint8_t>::max();
399
400 if (actfn == nck::ActivationFunction::ReLU ||
401 actfn == nck::ActivationFunction::ReLU6) {
402 const int32_t bottom_rail = output_quant.offset;
403 clamp_min = std::max(clamp_min, bottom_rail);
404 }
405
406 if (actfn == nck::ActivationFunction::ReLU6) {
407 const int32_t top_rail = output_quant.quantize(6.0f);
408 clamp_max = std::min(clamp_max, top_rail);
409 }
410
411 // Call the tile execution method
412 tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
413 StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
414 clamp_max, clamp_min, input_quant.offset,
415 weight_quant.offset, output_quant.offset,
416 requant.multiplier, requant.shift);
417}
418
419template <
420 unsigned int OutputTileRows, unsigned int OutputTileCols,
421 unsigned int KernelRows, unsigned int KernelCols,
422 unsigned int StrideRows, unsigned int StrideCols
423>
424template <nck::ActivationFunction Activation>
425void QAsymm8DepthwiseConvolution<
426 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
427>::execute_tile(
428 int n_channels,
429 const void* packed_params,
430 const uint8_t* inptr,
431 unsigned int in_row_stride,
432 unsigned int in_col_stride,
433 uint8_t* outptr,
434 unsigned int out_row_stride,
435 unsigned int out_col_stride
436 ) {
437 // Construct methods to get pointers
438 const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
439 const int i, const int j, const int channel) {
440 return inptr + i * in_row_stride + j * in_col_stride + channel;
441 };
442
443 const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
444 const int i, const int j, const int channel) {
445 return outptr + i * out_row_stride + j * out_col_stride + channel;
446 };
447
448 execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
449 StrideRows, StrideCols>(
450 n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
451 _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
452}
453
454template <
455 unsigned int OutputTileRows, unsigned int OutputTileCols,
456 unsigned int KernelRows, unsigned int KernelCols,
457 unsigned int StrideRows, unsigned int StrideCols
458>
459template <nck::ActivationFunction Activation>
460void QAsymm8DepthwiseConvolution<
461 OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
462>::execute_tile(
463 int n_channels,
464 const void* packed_params,
465 const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
466 uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
467 ) {
468 // Construct methods to get pointers
469 const auto get_input_ptr = [inptrs](const int i, const int j,
470 const int channel) {
471 return inptrs[i][j] + channel;
472 };
473
474 const auto get_output_ptr = [outptrs](const int i, const int j,
475 const int channel) {
476 return outptrs[i][j] + channel;
477 };
478
479 // Call the tile execution method
480 execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
481 StrideRows, StrideCols>(
482 n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
483 _inputs_quant, _weights_quant, _output_quant, rescale_parameters);
484}
485
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000486} // namespace depthwise