/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"

#pragma once

namespace {

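// Compute one output tile of a depthwise convolution with per-channel
// symmetrically quantised (QSYMM8) weights and asymmetrically quantised
// (QASYMM8) uint8 activations. Channels are processed eight at a time with
// NEON; any remainder is handled by the scalar tail loop at the end of the
// function. `packed_params` must have been produced by _pack_params below.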
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void tilefn_hybrid(
  int n_channels,
  const void* packed_params,
  FInput &get_input_ptr,
  FOutput &get_output_ptr,
  int32_t clamp_min,
  int32_t clamp_max,
  uint8_t input_offset,
  uint8_t output_offset
)
{
  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;

  // Offset into channels
  int channel = 0;

  // Byte type pointer to weights and biases
  const int8_t *wbptr = static_cast<const int8_t *>(packed_params);

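  // Vectorised path: process eight channels per iteration. For each block of
  // eight channels the packed stream holds 8 biases, 8 requantisation
  // multipliers and 8 shift exponents (all int32), followed by eight int8
  // weights for every kernel element.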
  for (; n_channels >= 8; n_channels -= 8, channel += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    const int32x4_t multipliers[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
    };
    const int32x4_t shifts[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
    };
    wbptr += 24*sizeof(int32_t);

    int16x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        const auto w = vld1_s8(wbptr);
        weights[i][j] = reinterpret_cast<int16x8_t>(vmovl_s8(w));
        wbptr += 8;
      }
    }

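    // Load the input tile, widening each activation to 16 bits and removing
    // the input zero-point so that the accumulation below is plain signed
    // arithmetic.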
    int16x8_t inputs[InnerTileRows][InnerTileCols];
    const uint8x8_t ioffset = vdup_n_u8(input_offset);
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        const auto x = vld1_u8(get_input_ptr(i, j, channel));
        inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = biases[i];
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj];
            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
          }
        }

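        // Requantise the accumulators: fixed-point multiply by the
        // per-channel multiplier, apply a rounding power-of-two shift, re-add
        // the output zero-point and clamp to the activation range.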
        int32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(accs[i], multipliers[i]),
            shifts[i]);
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
          final_accs[i] = vaddq_s32(y, offset);
          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
        }

        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
                                         vreinterpretq_s16_s32(final_accs[1]));
        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
        const uint8x8_t output =
          vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));

        vst1_u8(get_output_ptr(oi, oj, channel), output);
      }
    }
  }

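  // Scalar tail: process the remaining (fewer than eight) channels one at a
  // time using the same arithmetic as the vectorised loop above.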
  for (; n_channels; n_channels--, channel++)
  {
    // Load the bias, requantisation multiplier and shift for this channel
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
    const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));

    wbptr += 3*sizeof(int32_t);

    // Load weights
    int16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    int16_t inputs[InnerTileRows][InnerTileCols];
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += w * x;
          }
        }

        // Requantize
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, multiplier),
          -shift);
        acc += output_offset;
        acc = std::max(acc, clamp_min);
        acc = std::min(acc, clamp_max);
        uint8_t output = static_cast<uint8_t>(acc);
        *(get_output_ptr(oi, oj, channel)) = output;
      }
    }
  }
}

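// Translate the activation function and output quantisation into integer
// clamp bounds, then invoke tilefn_hybrid with them.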
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void execute_tilefn_hybrid(
  int n_channels,
  const void* packed_params,
  const ActivationFunction actfn,
  const qasymm8::QAsymm8Params &input_quant,
  const qasymm8::QAsymm8Params &output_quant,
  FInput &get_input_ptr,
  FOutput &get_output_ptr) {

  // Compute min/max clamp values
  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
  int32_t clamp_max = std::numeric_limits<uint8_t>::max();

  if (actfn == ActivationFunction::ReLU) {
    clamp_min = output_quant.offset;
  }

  // Disabling Relu6 for now
  if (actfn == ActivationFunction::ReLU6) {
    const int32_t top_rail = output_quant.quantize(6.0f);
    clamp_max = std::min(clamp_max, top_rail);
  }

  // Call the tile execution method
  tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                StrideRows, StrideCols>(
    n_channels, packed_params, get_input_ptr, get_output_ptr,
    clamp_min, clamp_max, input_quant.offset, output_quant.offset);
}
}

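// Member definitions for QSymm8HybridPerChannelDepthwiseConvolution, the
// "hybrid" kernel pairing QSYMM8 per-channel weights with QASYMM8 input and
// output tensors.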
namespace depthwise {
using namespace qsymm8;
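// This constructor derives the per-channel rescale (multiplier/shift)
// parameters from the three quantisation infos and delegates to the
// constructor below.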
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QSymm8HybridPerChannelDepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QSymm8PerChannelParams& weight_quantisation,
  const qasymm8::QAsymm8Params& input_quantisation,
  const qasymm8::QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QSymm8HybridPerChannelDepthwiseConvolution(
      n_batches, n_input_rows, n_input_cols, n_channels,
      activation, weight_quantisation, input_quantisation, output_quantisation,
      QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
      padding_top, padding_left, padding_bottom, padding_right
    )
{
}

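// This overload accepts precomputed rescale parameters; the geometry and
// activation are forwarded to the base class while the quantisation
// parameters are stored for packing and tile execution.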
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QSymm8HybridPerChannelDepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QSymm8PerChannelParams& weight_quantisation,
  const qasymm8::QAsymm8Params& input_quantisation,
  const qasymm8::QAsymm8Params& output_quantisation,
  const QSymm8PerChannelRescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
      n_batches, n_input_rows, n_input_cols, n_channels, activation,
      padding_top, padding_left, padding_bottom, padding_right
    ),
    _weights_quant(weight_quantisation),
    _input_quant(input_quantisation),
    _output_quant(output_quantisation),
    _rescale_parameters(rescale_params)
{
}

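// Padding is filled with the input zero-point so that, once the offset is
// subtracted inside the kernel, padded elements contribute nothing to the
// accumulation.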
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  return _input_quant.offset;
}

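// Interleave biases, rescale parameters and weights into the layout consumed
// by tilefn_hybrid: for each block of `veclen` channels, `veclen` biases,
// `veclen` multipliers and `veclen` negated shifts (all int32), followed by
// the int8 weights ordered kernel-row, then kernel-column, then channel.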
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const int8_t *wptr = static_cast<const int8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
  const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
  int8_t *outptr = static_cast<int8_t *>(buffer);

  // We set the vector length to use 64-bit (double, D) registers on both
  // AArch64 and AArch32, i.e. eight channels at a time.
  // NOTE: for SVE set this to half the vector length.
  unsigned int veclen = 8;

  // While there are channels left to process, pack a vector length of them at
  // a time and reduce the size of vector used as the size of the tensor
  // decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE, the vector length remains the
    // same and we just don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
    int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
    int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
      const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;

      out_bptr[n] = bias;
      out_mptr[n] = multiplier;
      out_sptr[n] = -shift;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
        }
      }
      wptr++;
    }
  }
}

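// Execute a tile given raw input/output base pointers and row/column strides:
// build pointer-lookup lambdas from the strides and forward to
// execute_tilefn_hybrid.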
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  unsigned int in_row_stride,
  unsigned int in_col_stride,
  uint8_t* outptr,
  unsigned int out_row_stride,
  unsigned int out_col_stride
) {

  // Construct methods to get pointers
  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
      const int i, const int j, const int channel) {
    return inptr + i * in_row_stride + j * in_col_stride + channel;
  };

  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
      const int i, const int j, const int channel) {
    return outptr + i * out_row_stride + j * out_col_stride + channel;
  };

  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                        StrideRows, StrideCols>(
    n_channels, packed_params, Activation, _input_quant, _output_quant,
    get_input_ptr, get_output_ptr);
}

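// Variant of execute_tile that receives precomputed per-element input and
// output pointer arrays instead of base pointers and strides.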
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
) {
  // Construct methods to get pointers
  const auto get_input_ptr = [inptrs](const int i, const int j,
                                      const int channel) {
    return inptrs[i][j] + channel;
  };

  const auto get_output_ptr = [outptrs](const int i, const int j,
                                        const int channel) {
    return outptrs[i][j] + channel;
  };

  // Call the tile execution method
  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                        StrideRows, StrideCols>(
    n_channels, packed_params, Activation, _input_quant, _output_quant,
    get_input_ptr, get_output_ptr);
}

}  // namespace depthwise