/*
 * Copyright (c) 2019-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "src/core/NEON/wrapper/wrapper.h"

namespace arm_compute
{
namespace cpu
{
namespace
{
constexpr auto data_layout = DataLayout::NHWC;
const size_t width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

constexpr auto dim_manual_loop      = Window::Dimension(0, 0, 0);
constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
constexpr size_t vector_size        = 8;

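// Gathers the shapes, strides, padding and execution-window bounds that the
// depthwise loops below consume, so they are resolved once per run rather than
// recomputed on every iteration.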
struct DepthwiseConvolutionRunInfo
{
    const size_t   num_read_elements_per_iteration;
    const uint32_t x_start;
    const uint32_t x_end;
    const uint32_t x_step;
    const uint32_t x_leftover_start;
    const size_t   input_stride_y;
    const size_t   input_stride_z;
    const size_t   input_max_offset;
    const size_t   weights_width;
    const size_t   weights_height;
    const size_t   weights_stride_y;
    const size_t   weights_stride_z;
    const size_t   conv_stride_x;
    const size_t   conv_stride_y;
    const size_t   conv_pad_left;
    const size_t   conv_pad_top;
    const size_t   input_height;
    const size_t   input_width;
    const size_t   input_depth;

    DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
        : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
          x_start(w.x().start()),
          x_end(w.x().end()),
          x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
          x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
          input_stride_y(input.strides_in_bytes().y()),
          input_stride_z(input.strides_in_bytes().z()),
          input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
          weights_width(weights.dimension(width_idx)),
          weights_height(weights.dimension(height_idx)),
          weights_stride_y(weights.strides_in_bytes().y()),
          weights_stride_z(weights.strides_in_bytes().z()),
          conv_stride_x(conv_info.stride().first),
          conv_stride_y(conv_info.stride().second),
          conv_pad_left(conv_info.pad_left()),
          conv_pad_top(conv_info.pad_top()),
          input_height(input.dimension(height_idx)),
          input_width(input.dimension(width_idx)),
          input_depth(input.dimension(channel_idx))
    {
    }
};

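// Saturating rounding doubling multiply returning the high half (vqrdmulh), used
// together with rounding_divide_by_exp2() to apply the fixed-point output
// multiplier during requantization.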
inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
{
    return vqrdmulhq_n_s32(a, b);
}

inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
{
    return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

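// Rounding arithmetic right shift by 'exponent' (divide by 2^exponent), with a
// fixup so negative values are rounded consistently before the shift.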
inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
{
    const int32x4_t shift = vdupq_n_s32(-exponent);
    const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
    const int32x4_t fixed = vqaddq_s32(x, fixup);
    return vrshlq_s32(fixed, shift);
}

inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
{
    const int32x2_t shift = vdup_n_s32(-exponent);
    const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
    const int32x2_t fixed = vqadd_s32(x, fixup);
    return vrshl_s32(fixed, shift);
}

inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
{
    const int32x2_t xs = vdup_n_s32(x);
    return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}

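// Checks whether the dilated filter tap (w, h), anchored at (base_w, base_h),
// lands inside the input plane; out-of-bounds taps are treated as padding.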
inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
{
    const int32_t current_h  = base_h + h * dilation.y();
    const bool    is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);

    const int32_t current_w  = base_w + w * dilation.x();
    const bool    is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);

    return is_valid_h && is_valid_w;
}

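// Floating-point depthwise convolution for depth_multiplier == 1: channels are
// processed in NEON vectors of (vector_size / sizeof(T)) elements, with a scalar
// loop handling the leftover channels.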
template <typename T>
void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                   const Size2D &dilation, const Window &window, bool has_biases)
{
    constexpr auto element_per_vector = vector_size / sizeof(T);
    using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);

    const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});

    Window execution_window = window;
    execution_window.set(Window::DimX, dim_single_unit_step);

    Window win_input = window;
    win_input.set(Window::DimX, dim_manual_loop);
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = win_input;
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set(Window::DimX, dim_manual_loop);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto const base_weights_ptr = weights_it.ptr();
        uint32_t   x                = run_info.x_start;

        for(; x < run_info.x_leftover_start; x += run_info.x_step)
        {
            VectorType acc          = zero_vector;
            auto       weights_ptr  = base_weights_ptr;
            int64_t    input_offset = base_input_offset;

            for(uint32_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(uint32_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ?
                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
                                                 zero_vector;
                    const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
                    acc                     = wrapper::vmla(acc, weights_vals, input_vals);

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            if(has_biases)
            {
                const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
                acc                    = wrapper::vadd(acc, biases_vals);
            }

            wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
        }

        for(; x < run_info.x_end; ++x)
        {
            auto    acc_scalar   = T{ 0 };
            auto    weights_ptr  = base_weights_ptr;
            int64_t input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
                    const auto weights_vals    = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    acc_scalar += (input_vals * weights_vals);

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            if(has_biases)
            {
                const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
                acc_scalar += biases_vals;
            }
            *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
        }
    },
    input_it, weights_it, biases_it, output_it);
}

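// Floating-point depthwise convolution for a generic depth_multiplier: each input
// channel is read once and accumulated against its depth_multiplier filter values.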
template <typename T>
void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        const int input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                offs += dilation.x() * run_info.input_stride_y;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}

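// Quantized depthwise convolution for depth_multiplier == 1: accumulates in int32
// alongside running input/weight sums so the asymmetric quantization offsets can be
// folded in before the result is requantized and clamped to the output type.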
template <typename T, typename TW>
void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
    ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
    constexpr auto element_per_vector = vector_size / sizeof(T);
    using VectorType   = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
    using TagType      = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
    using AccType      = int32_t;
    using AccArrayType = std::array<AccType, element_per_vector>;

    const auto out_of_bound_value  = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
    const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);

    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
    const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

    Window execution_window = window;
    execution_window.set(Window::DimX, dim_single_unit_step);

    Window win_input = window;
    win_input.set(Window::DimX, dim_manual_loop);
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = win_input;
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set(Window::DimX, dim_manual_loop);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
        auto const    base_weights_ptr  = weights_it.ptr();
        size_t        x                 = run_info.x_start;

        for(; x < run_info.x_leftover_start; x += run_info.x_step)
        {
            AccArrayType acc{};
            AccArrayType in_sum{};
            AccArrayType we_sum{};

            auto weights_ptr  = base_weights_ptr;
            auto input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ?
                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
                                                 out_of_bound_vector;
                    const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    for(size_t i = 0; i < element_per_vector; ++i)
                    {
                        acc.at(i) += input_vals[i] * weights_vals[i];
                        in_sum.at(i) += input_vals[i];
                        we_sum.at(i) += weights_vals[i];
                    }

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
            for(size_t i = 0; i < element_per_vector; ++i)
            {
                acc.at(i) -= in_sum.at(i) * weights_qoffset;
                acc.at(i) -= we_sum.at(i) * input_qoffset;
                acc.at(i) += k_offset;

                if(has_biases)
                {
                    acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
                }

                const int32_t out_mul   = output_multiplier.at(x + i);
                const int32_t out_shift = output_shift.at(x + i);
                if(out_shift < 0)
                {
                    acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
                }
                else
                {
                    acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
                }
                out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
            }

            wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
        }

        // left-over
        for(; x < run_info.x_end; ++x)
        {
            AccType acc    = 0;
            AccType in_sum = 0;
            AccType we_sum = 0;

            auto weights_ptr  = base_weights_ptr;
            auto input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_val       = is_valid_region ?
                                                 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
                                                 out_of_bound_value;
                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    acc += input_val * weights_val;
                    in_sum += input_val;
                    we_sum += weights_val;

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            T out_vals{ 0 };

            acc -= in_sum * weights_qoffset;
            acc -= we_sum * input_qoffset;
            acc += k_offset;

            if(has_biases)
            {
                acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
            }

            const int32_t out_mul   = output_multiplier.at(x);
            const int32_t out_shift = output_shift.at(x);

            if(out_shift < 0)
            {
                acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
            }

            out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
            *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
        }
    },
    input_it, weights_it, biases_it, output_it);
}

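// Quantized depthwise convolution for a generic depth_multiplier, with scalar int32
// accumulation, offset correction and per-output-channel requantization.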
template <typename T, typename TW>
void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
    using AccType = int32_t;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();

    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
    const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::vector<AccType> acc(depth_multiplier, 0);
        std::vector<AccType> we_sum(depth_multiplier, 0);
        AccType              in_sum = 0;

        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    acc.at(m) += input_val * weights_val;

                    we_sum.at(m) += weights_val;
                }

                offs += dilation.x() * run_info.input_stride_y;
                in_sum += input_val;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0; m < depth_multiplier; ++m)
        {
            acc.at(m) -= in_sum * weights_qoffset;
            acc.at(m) -= we_sum.at(m) * input_qoffset;
            acc.at(m) += k_offset;

            if(has_biases)
            {
                acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
            }

            const int32_t out_mul   = output_multiplier.at(id.x() * depth_multiplier + m);
            const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
            if(out_shift < 0)
            {
                acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
            }
            *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
        }
    },
    input_it, weights_it, biases_it, output_it);
}

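// Quantized depthwise convolution specialised for per-tensor quantization and a
// power-of-two depth_multiplier (>= 8): 8-bit values are widened to 16 bits and
// accumulated into pairs of int32x4 registers per group of vector_size multipliers.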
template <typename T, typename TW>
void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
    constexpr int half_vec = vector_size / 2;

    using AccType          = int32_t;
    using AccVectorType    = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
    using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
    using TagType          = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
    const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
    const auto output_qoffset_vec  = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});

    const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
    const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
    const auto zero  = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});

    const auto out_mul   = output_multiplier.at(0);
    const auto out_shift = output_shift.at(0);

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
    std::vector<AccVectorType> acc1(depth_multiplier / vector_size);

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::fill(begin(acc0), end(acc0), zero);
        std::fill(begin(acc1), end(acc1), zero);

        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            const int32_t current_h = input_z + h * dilation.y();
            if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
            {
                int offs = input_offset;
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const int32_t current_w = input_y + w * dilation.x();
                    if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
                    {
                        const auto input_8x8     = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
                        const auto input_s16x8   = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
                        const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);

                        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
                        {
                            const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                            const auto weights_s16x8   = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
                            const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);

                            acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
                            acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
                        }
                    }

                    offs += dilation.x() * run_info.input_stride_y;
                }
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
        {
            if(has_biases)
            {
                const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
                const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));

                acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
                acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
            }

            if(out_shift < 0)
            {
                acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
            }
            else
            {
                acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
            }

            acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
            acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);

            const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
                                                   wrapper::vmovn(acc1.at(i)));

            if(std::is_same<T, uint8_t>::value)
            {
                wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
            }
            else
            {
                wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
} // namespace
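
// Entry point for floating-point depthwise convolution: selects between the
// depth_multiplier == 1 and the generic implementation above.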
template <typename T, typename TW>
void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
                         ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
{
    PadStrideInfo conv_info        = info.pad_stride_info;
    unsigned int  depth_multiplier = info.depth_multiplier;
    Size2D        dilation         = info.dilation;

    if(depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
    }
    else
    {
        depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
    }
}
template void run_depthwise_float<float, float>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template void run_depthwise_float<float16_t, float16_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                        ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

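// Entry point for quantized 8-bit depthwise convolution: computes the per-channel
// output multipliers/shifts from the tensor scales, then dispatches to the most
// suitable loop above.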
template <typename T, typename TW>
void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
{
    PadStrideInfo    conv_info        = info.pad_stride_info;
    unsigned int     depth_multiplier = info.depth_multiplier;
    Size2D           dilation         = info.dilation;
    std::vector<int> output_multiplier;
    std::vector<int> output_shift;

    const auto input_scale   = src->info()->quantization_info().uniform().scale;
    const auto output_scale  = dst->info()->quantization_info().uniform().scale;
    auto       weights_scale = weights->info()->quantization_info().scale();

    if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
    {
        for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
        {
            weights_scale.push_back(weights_scale.front());
        }
    }

    for(const auto &s : weights_scale)
    {
        int32_t     out_mult   = 0;
        int32_t     out_shift  = 0;
        const float multiplier = input_scale * s / output_scale;
        arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

        output_multiplier.push_back(out_mult);
        output_shift.push_back(out_shift);
    }

    if(depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
    }
    else
    {
        const bool is_pow2                 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
        const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));

        if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
        {
            depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
        }
        else
        {
            depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
        }
    }
}
template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                             ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                           ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                            ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
} // namespace cpu
} // namespace arm_compute