blob: eac9baaf014392997489af51ed8170c7999dfc9c [file] [log] [blame]
/*
 * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Manuel Bottinib4bb6a02021-05-24 16:01:32 +010024#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010025
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +010026#include "arm_compute/core/ITensor.h"
27#include "arm_compute/core/ITensorInfo.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010028#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010029#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010030#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010031#include "src/core/NEON/wrapper/traits.h"
32#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010033#include "src/core/helpers/AutoConfiguration.h"
34#include "src/core/helpers/WindowHelpers.h"
Matthew Bentham758b5ba2020-03-05 23:37:48 +000035#include "support/ToolchainSupport.h"
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +010036
Giorgio Arena44f55722019-07-12 14:49:49 +010037namespace arm_compute
38{
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +010039namespace cpu
40{
41namespace kernels
42{
Giorgio Arena44f55722019-07-12 14:49:49 +010043namespace
44{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010045constexpr auto data_layout = DataLayout::NHWC;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010046const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
47const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
48const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
49
50constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
51constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
52constexpr size_t vector_size = 8;
53
54struct DepthwiseConvolutionRunInfo
Giorgio Arenad93e2632019-10-15 11:09:33 +010055{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010056 const size_t num_read_elements_per_iteration;
57 const uint32_t x_start;
58 const uint32_t x_end;
59 const uint32_t x_step;
60 const uint32_t x_leftover_start;
61 const size_t input_stride_y;
62 const size_t input_stride_z;
63 const size_t input_max_offset;
64 const size_t weights_width;
65 const size_t weights_height;
66 const size_t weights_stride_y;
67 const size_t weights_stride_z;
68 const size_t conv_stride_x;
69 const size_t conv_stride_y;
70 const size_t conv_pad_left;
71 const size_t conv_pad_top;
72 const size_t input_height;
73 const size_t input_width;
74 const size_t input_depth;
75
Manuel Bottinib4bb6a02021-05-24 16:01:32 +010076 DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010077 : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
78 x_start(w.x().start()),
79 x_end(w.x().end()),
80 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
81 x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
82 input_stride_y(input.strides_in_bytes().y()),
83 input_stride_z(input.strides_in_bytes().z()),
84 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
85 weights_width(weights.dimension(width_idx)),
86 weights_height(weights.dimension(height_idx)),
87 weights_stride_y(weights.strides_in_bytes().y()),
88 weights_stride_z(weights.strides_in_bytes().z()),
89 conv_stride_x(conv_info.stride().first),
90 conv_stride_y(conv_info.stride().second),
91 conv_pad_left(conv_info.pad_left()),
92 conv_pad_top(conv_info.pad_top()),
93 input_height(input.dimension(height_idx)),
94 input_width(input.dimension(width_idx)),
95 input_depth(input.dimension(channel_idx))
Giorgio Arenad93e2632019-10-15 11:09:33 +010096 {
Giorgio Arenad93e2632019-10-15 11:09:33 +010097 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010098};
99
Michele Di Giorgiod02d5ed2021-01-22 09:47:04 +0000100inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
101{
102 return vqrdmulhq_n_s32(a, b);
103}
104
105inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
106{
107 return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
108}
109
110inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
111{
112 const int32x4_t shift = vdupq_n_s32(-exponent);
113 const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
114 const int32x4_t fixed = vqaddq_s32(x, fixup);
115 return vrshlq_s32(fixed, shift);
116}
117
118inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
119{
120 const int32x2_t shift = vdup_n_s32(-exponent);
121 const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
122 const int32x2_t fixed = vqadd_s32(x, fixup);
123 return vrshl_s32(fixed, shift);
124}
125
126inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
127{
128 const int32x2_t xs = vdup_n_s32(x);
129 return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
130}
131
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100132inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
133{
134 const int32_t current_h = base_h + h * dilation.y();
135 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
136
137 const int32_t current_w = base_w + w * dilation.x();
138 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
139
140 return is_valid_h && is_valid_w;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100141}
142
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100143template <typename T>
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100144void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100145 const Size2D &dilation, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100146{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100147 constexpr auto element_per_vector = vector_size / sizeof(T);
148 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
149 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
Giorgio Arena44f55722019-07-12 14:49:49 +0100150
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100151 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100152
153 const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
154
155 Window execution_window = window;
156 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100157
158 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100159 win_input.set(Window::DimX, dim_manual_loop);
160 win_input.set(Window::DimY, dim_manual_loop);
161 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100162
163 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100164 win_weights.set(Window::DimW, dim_manual_loop);
165
166 Window win_output = window;
167 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100168
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100169 Iterator input_it(src, win_input);
Giorgio Arena44f55722019-07-12 14:49:49 +0100170 Iterator weights_it(weights, win_weights);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100171 Iterator output_it(dst, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100172 Iterator biases_it{};
173
174 if(has_biases)
175 {
176 biases_it = Iterator(biases, win_weights);
177 }
178
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100179 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100180 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100181 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
182 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
183 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100184
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100185 auto const base_weights_ptr = weights_it.ptr();
186 uint32_t x = run_info.x_start;
Giorgio Arena44f55722019-07-12 14:49:49 +0100187
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100188 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arena44f55722019-07-12 14:49:49 +0100189 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100190 VectorType acc = zero_vector;
191 auto weights_ptr = base_weights_ptr;
192 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100193
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100194 for(uint32_t h = 0; h < run_info.weights_height; ++h)
195 {
196 int64_t offs = input_offset + x * sizeof(T);
197 for(uint32_t w = 0; w < run_info.weights_width; ++w)
198 {
199 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
200 const auto input_vals = is_valid_region ?
201 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
202 zero_vector;
203 const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
204 acc = wrapper::vmla(acc, weights_vals, input_vals);
205
206 offs += dilation.x() * run_info.input_stride_y;
207 }
208
209 weights_ptr += run_info.weights_stride_z;
210 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100211 }
212
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100213 if(has_biases)
214 {
215 const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
216 acc = wrapper::vadd(acc, biases_vals);
217 }
218
219 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
Giorgio Arena44f55722019-07-12 14:49:49 +0100220 }
221
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100222 for(; x < run_info.x_end; ++x)
Giorgio Arena44f55722019-07-12 14:49:49 +0100223 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100224 auto acc_scalar = T{ 0 };
225 auto weights_ptr = base_weights_ptr;
226 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100227
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100228 for(size_t h = 0; h < run_info.weights_height; ++h)
229 {
230 int64_t offs = input_offset + x * sizeof(T);
231 for(size_t w = 0; w < run_info.weights_width; ++w)
232 {
233 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
234 const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
235 const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
236
237 acc_scalar += (input_vals * weights_vals);
238
239 offs += dilation.x() * run_info.input_stride_y;
240 }
241
242 weights_ptr += run_info.weights_stride_z;
243 input_offset += dilation.y() * run_info.input_stride_z;
244 }
245
246 if(has_biases)
247 {
248 const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
249 acc_scalar += biases_vals;
250 }
251 *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
252 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100253 },
254 input_it, weights_it, biases_it, output_it);
255}
256
Michalis Spyrouf401c742020-05-12 16:18:33 +0100257template <typename T>
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100258void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100259 const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100260{
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100261 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Giorgio Arena44f55722019-07-12 14:49:49 +0100262
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100263 Window execution_window = window;
264 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arena44f55722019-07-12 14:49:49 +0100265
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100266 Window win_input = execution_window;
267 win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
268 win_input.set(Window::DimY, dim_manual_loop);
269 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100270
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100271 Window win_weights = window;
272 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
273 win_weights.set(Window::DimY, dim_manual_loop);
274 win_weights.set(Window::DimZ, dim_manual_loop);
275 win_weights.set(Window::DimW, dim_manual_loop);
276
277 Window win_output = window;
278 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100279
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100280 Iterator input_it(src, win_input);
Giorgio Arena44f55722019-07-12 14:49:49 +0100281 Iterator weights_it(weights, win_weights);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100282 Iterator output_it(dst, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100283 Iterator biases_it{};
284
285 if(has_biases)
286 {
287 biases_it = Iterator(biases, win_weights);
288 }
289
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100290 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100291 {
292 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
293
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100294 const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
295 const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
296 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100297
298 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100299 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arena44f55722019-07-12 14:49:49 +0100300 {
301 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100302 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arena44f55722019-07-12 14:49:49 +0100303 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100304 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
305 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
Giorgio Arena44f55722019-07-12 14:49:49 +0100306
307 for(size_t m = 0; m < depth_multiplier; ++m)
308 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100309 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +0100310 acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
Giorgio Arena44f55722019-07-12 14:49:49 +0100311 }
312
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100313 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arena44f55722019-07-12 14:49:49 +0100314 }
315
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100316 weights_ptr += run_info.weights_stride_z;
317 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100318 }
319
320 if(has_biases)
321 {
322 for(size_t m = 0; m < depth_multiplier; ++m)
323 {
324 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
325 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
326 }
327 }
328 else
329 {
330 for(size_t m = 0; m < depth_multiplier; ++m)
331 {
332 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
333 }
334 }
335 },
336 input_it, weights_it, biases_it, output_it);
337}
338
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100339template <typename T, typename TW>
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100340void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
341 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
Giorgio Arenad93e2632019-10-15 11:09:33 +0100342{
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100343 ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100344 constexpr auto element_per_vector = vector_size / sizeof(T);
345 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
346 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
347 using AccType = int32_t;
348 using AccArrayType = std::array<AccType, element_per_vector>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100349
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100350 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100351 const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
352
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100353 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100354
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100355 const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100356 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100357 const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100358 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
359
360 Window execution_window = window;
361 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100362
363 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100364 win_input.set(Window::DimX, dim_manual_loop);
365 win_input.set(Window::DimY, dim_manual_loop);
366 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100367
368 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100369 win_weights.set(Window::DimW, dim_manual_loop);
370
371 Window win_output = window;
372 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100373
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100374 Iterator input_it(src, win_input);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100375 Iterator weights_it(weights, win_weights);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100376 Iterator output_it(dst, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100377 Iterator biases_it{};
378
379 if(has_biases)
380 {
381 biases_it = Iterator(biases, win_weights);
382 }
383
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100384 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100385 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100386 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
387 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
388 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
389 auto const base_weights_ptr = weights_it.ptr();
390 size_t x = run_info.x_start;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100391
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100392 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100393 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100394 AccArrayType acc{};
395 AccArrayType in_sum{};
396 AccArrayType we_sum{};
Giorgio Arenad93e2632019-10-15 11:09:33 +0100397
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100398 auto weights_ptr = base_weights_ptr;
399 auto input_offset = base_input_offset;
400
401 for(size_t h = 0; h < run_info.weights_height; ++h)
402 {
403 int64_t offs = input_offset + x * sizeof(T);
404 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100405 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100406 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
407 const auto input_vals = is_valid_region ?
408 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
409 out_of_bound_vector;
410 const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
411
Sang-Hoon Park1a0a4bc2020-11-12 17:41:32 +0000412 for(size_t i = 0; i < element_per_vector; ++i)
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100413 {
414 acc.at(i) += input_vals[i] * weights_vals[i];
415 in_sum.at(i) += input_vals[i];
416 we_sum.at(i) += weights_vals[i];
417 }
418
419 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100420 }
421
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100422 weights_ptr += run_info.weights_stride_z;
423 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100424 }
425
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100426 VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
Sang-Hoon Park1a0a4bc2020-11-12 17:41:32 +0000427 for(size_t i = 0; i < element_per_vector; ++i)
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100428 {
429 acc.at(i) -= in_sum.at(i) * weights_qoffset;
430 acc.at(i) -= we_sum.at(i) * input_qoffset;
431 acc.at(i) += k_offset;
432
433 if(has_biases)
434 {
435 acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
436 }
437
438 const int32_t out_mul = output_multiplier.at(x + i);
439 const int32_t out_shift = output_shift.at(x + i);
440 if(out_shift < 0)
441 {
442 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
443 }
444 else
445 {
446 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
447 }
448 out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
449 }
450
451 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100452 }
453
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100454 // left-over
455 for(; x < run_info.x_end; ++x)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100456 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100457 AccType acc = 0;
458 AccType in_sum = 0;
459 AccType we_sum = 0;
460
461 auto weights_ptr = base_weights_ptr;
462 auto input_offset = base_input_offset;
463
464 for(size_t h = 0; h < run_info.weights_height; ++h)
465 {
466 int64_t offs = input_offset + x * sizeof(T);
467 for(size_t w = 0; w < run_info.weights_width; ++w)
468 {
469 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
470 const auto input_val = is_valid_region ?
471 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
472 out_of_bound_value;
473 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
474
475 acc += input_val * weights_val;
476 in_sum += input_val;
477 we_sum += weights_val;
478
479 offs += dilation.x() * run_info.input_stride_y;
480 }
481
482 weights_ptr += run_info.weights_stride_z;
483 input_offset += dilation.y() * run_info.input_stride_z;
484 }
485
486 T out_vals{ 0 };
487
488 acc -= in_sum * weights_qoffset;
489 acc -= we_sum * input_qoffset;
490 acc += k_offset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100491
492 if(has_biases)
493 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100494 acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100495 }
496
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100497 const int32_t out_mul = output_multiplier.at(x);
498 const int32_t out_shift = output_shift.at(x);
499
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000500 if(out_shift < 0)
501 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100502 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000503 }
504 else
505 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100506 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000507 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100508
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100509 out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
510 *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
511 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100512 },
513 input_it, weights_it, biases_it, output_it);
514}
515
Michalis Spyrouf401c742020-05-12 16:18:33 +0100516template <typename T, typename TW>
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100517void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
518 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
Giorgio Arenad93e2632019-10-15 11:09:33 +0100519{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100520 using AccType = int32_t;
521
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100522 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100523
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100524 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
Giorgio Arenad93e2632019-10-15 11:09:33 +0100525
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100526 const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100527 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100528 const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100529 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100530
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100531 Window execution_window = window;
532 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100533
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100534 Window win_input = execution_window;
535 win_input.set(Window::DimY, dim_manual_loop);
536 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100537
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100538 Window win_weights = window;
539 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
540 win_weights.set(Window::DimY, dim_manual_loop);
541 win_weights.set(Window::DimZ, dim_manual_loop);
542 win_weights.set(Window::DimW, dim_manual_loop);
543
544 Window win_output = window;
545 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100546
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100547 Iterator input_it(src, win_input);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100548 Iterator weights_it(weights, win_weights);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100549 Iterator output_it(dst, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100550 Iterator biases_it{};
551
552 if(has_biases)
553 {
554 biases_it = Iterator(biases, win_weights);
555 }
556
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100557 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100558 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100559 std::vector<AccType> acc(depth_multiplier, 0);
560 std::vector<AccType> we_sum(depth_multiplier, 0);
561 AccType in_sum = 0;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100562
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100563 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
564 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
565 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100566
567 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100568 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100569 {
570 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100571 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100572 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100573 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
574 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100575
576 for(size_t m = 0; m < depth_multiplier; ++m)
577 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100578 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100579 acc.at(m) += input_val * weights_val;
580
581 we_sum.at(m) += weights_val;
582 }
583
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100584 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100585 in_sum += input_val;
586 }
587
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100588 weights_ptr += run_info.weights_stride_z;
589 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100590 }
591
592 for(size_t m = 0; m < depth_multiplier; ++m)
593 {
594 acc.at(m) -= in_sum * weights_qoffset;
595 acc.at(m) -= we_sum.at(m) * input_qoffset;
596 acc.at(m) += k_offset;
597
598 if(has_biases)
599 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000600 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
601 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100602
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100603 const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
604 const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000605 if(out_shift < 0)
606 {
607 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100608 }
609 else
610 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000611 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100612 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100613 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100614 }
615 },
616 input_it, weights_it, biases_it, output_it);
617}
618
Giorgio Arena3737c792020-11-23 17:47:23 +0000619template <typename T, typename TW>
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100620void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
621 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
Giorgio Arena3737c792020-11-23 17:47:23 +0000622{
623 constexpr int half_vec = vector_size / 2;
624
625 using AccType = int32_t;
626 using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
627 using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
628 using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
629
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100630 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Giorgio Arena3737c792020-11-23 17:47:23 +0000631
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100632 const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
Giorgio Arena3737c792020-11-23 17:47:23 +0000633 const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100634 const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
Giorgio Arena3737c792020-11-23 17:47:23 +0000635
636 const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
637 const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
638 const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
639
640 const auto out_mul = output_multiplier.at(0);
641 const auto out_shift = output_shift.at(0);
642
643 Window execution_window = window;
644 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
645
646 Window win_input = execution_window;
647 win_input.set(Window::DimY, dim_manual_loop);
648 win_input.set(Window::DimZ, dim_manual_loop);
649
650 Window win_weights = window;
651 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
652 win_weights.set(Window::DimY, dim_manual_loop);
653 win_weights.set(Window::DimZ, dim_manual_loop);
654 win_weights.set(Window::DimW, dim_manual_loop);
655
656 Window win_output = window;
657 win_output.set_dimension_step(Window::DimX, run_info.x_step);
658
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100659 Iterator input_it(src, win_input);
Giorgio Arena3737c792020-11-23 17:47:23 +0000660 Iterator weights_it(weights, win_weights);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100661 Iterator output_it(dst, win_output);
Giorgio Arena3737c792020-11-23 17:47:23 +0000662 Iterator biases_it{};
663
664 if(has_biases)
665 {
666 biases_it = Iterator(biases, win_weights);
667 }
668
669 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
670 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
671
672 execute_window_loop(execution_window, [&](const Coordinates & id)
673 {
674 std::fill(begin(acc0), end(acc0), zero);
675 std::fill(begin(acc1), end(acc1), zero);
676
677 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
678 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
679 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
680
681 auto weights_ptr = weights_it.ptr();
682 for(size_t h = 0; h < run_info.weights_height; ++h)
683 {
684 const int32_t current_h = input_z + h * dilation.y();
685 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
686 {
687 int offs = input_offset;
688 for(size_t w = 0; w < run_info.weights_width; ++w)
689 {
690 const int32_t current_w = input_y + w * dilation.x();
691 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
692 {
693 const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
694 const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
695 const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
696
697 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
698 {
699 const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
700 const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
701 const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
702
703 acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
704 acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
705 }
706 }
707
708 offs += dilation.x() * run_info.input_stride_y;
709 }
710 }
711
712 weights_ptr += run_info.weights_stride_z;
713 input_offset += dilation.y() * run_info.input_stride_z;
714 }
715
716 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
717 {
718 if(has_biases)
719 {
720 const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
721 const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
722
723 acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
724 acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
725 }
726
727 if(out_shift < 0)
728 {
729 acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
730 acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
731 }
732 else
733 {
734 acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
735 acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
736 }
737
738 acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
739 acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
740
741 const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
742 wrapper::vmovn(acc1.at(i)));
743
744 if(std::is_same<T, uint8_t>::value)
745 {
746 wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
747 }
748 else
749 {
750 wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
751 }
752 }
753 },
754 input_it, weights_it, biases_it, output_it);
755}
756
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100757Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100758{
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100759 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
760 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
761 ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
762 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100763 ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100764 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
765 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
766 ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100767 ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
768 ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
Giorgio Arena44f55722019-07-12 14:49:49 +0100769
Giorgio Arenad93e2632019-10-15 11:09:33 +0100770 if(is_data_type_quantized_per_channel(weights->data_type()))
771 {
772 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100773 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
774 }
775 else
776 {
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100777 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100778 }
779
Giorgio Arena44f55722019-07-12 14:49:49 +0100780 if(biases != nullptr)
781 {
782 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
783 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100784
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100785 if(is_data_type_quantized_asymmetric(src->data_type()))
Giorgio Arenad93e2632019-10-15 11:09:33 +0100786 {
787 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
788 }
789 else
790 {
791 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
792 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100793 }
794
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100795 if(dst->total_size() != 0)
Giorgio Arena44f55722019-07-12 14:49:49 +0100796 {
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100797 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
798 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
799 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
Giorgio Arena44f55722019-07-12 14:49:49 +0100800 }
801
802 return Status{};
803}
Giorgio Arenad93e2632019-10-15 11:09:33 +0100804} // namespace
Giorgio Arena44f55722019-07-12 14:49:49 +0100805
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100806CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100807 : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
Giorgio Arena44f55722019-07-12 14:49:49 +0100808{
809}
810
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100811void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100812{
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100813 ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
814 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
Giorgio Arena44f55722019-07-12 14:49:49 +0100815
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100816 _conv_info = info.pad_stride_info;
817 _depth_multiplier = info.depth_multiplier;
818 _dilation = info.dilation;
Michalis Spyrouf401c742020-05-12 16:18:33 +0100819 _has_biases = (biases != nullptr);
Giorgio Arena44f55722019-07-12 14:49:49 +0100820
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100821 if(is_data_type_quantized(src->data_type()))
Giorgio Arena44f55722019-07-12 14:49:49 +0100822 {
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100823 const auto input_scale = src->quantization_info().uniform().scale;
824 const auto output_scale = dst->quantization_info().uniform().scale;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100825
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100826 auto weights_scale = weights->quantization_info().scale();
827 if(!is_data_type_quantized_per_channel(weights->data_type()))
Giorgio Arenad93e2632019-10-15 11:09:33 +0100828 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100829 for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100830 {
831 weights_scale.push_back(weights_scale.front());
832 }
833 }
834
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100835 for(const auto &s : weights_scale)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100836 {
Michalis Spyroue7be8a02019-12-12 16:16:09 +0000837 int32_t out_mult = 0;
838 int32_t out_shift = 0;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100839 const float multiplier = input_scale * s / output_scale;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000840 arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100841
842 _output_multiplier.push_back(out_mult);
843 _output_shift.push_back(out_shift);
844 }
845 }
846
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100847 switch(weights->data_type())
Giorgio Arenad93e2632019-10-15 11:09:33 +0100848 {
849 case DataType::QASYMM8:
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100850 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100851 break;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000852 case DataType::QASYMM8_SIGNED:
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100853 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000854 break;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100855 case DataType::QSYMM8_PER_CHANNEL:
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100856 if(src->data_type() == DataType::QASYMM8)
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000857 {
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100858 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000859 }
860 else
861 {
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100862 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000863 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100864 break;
865#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
866 case DataType::F16:
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100867 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100868 break;
869#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arena44f55722019-07-12 14:49:49 +0100870 case DataType::F32:
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100871 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
Giorgio Arena44f55722019-07-12 14:49:49 +0100872 break;
873 default:
874 ARM_COMPUTE_ERROR("Data type not supported");
875 break;
876 }
877
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100878 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
879 auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100880
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100881 Window win = calculate_max_window(*dst, Steps());
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100882 ICpuKernel::configure(win);
Giorgio Arena44f55722019-07-12 14:49:49 +0100883}
884
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100885Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100886{
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100887 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
Giorgio Arena44f55722019-07-12 14:49:49 +0100888 return Status{};
889}
890
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100891template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>>
892void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
893 ITensor *dst, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100894{
895 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100896 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Giorgio Arena44f55722019-07-12 14:49:49 +0100897
898 if(_depth_multiplier == 1)
899 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100900 depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100901 }
902 else
903 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100904 depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100905 }
906}
907
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100908template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>>
909void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
910 ITensor *dst, const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100911{
912 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100913 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100914
915 if(_depth_multiplier == 1)
916 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100917 depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100918 }
919 else
920 {
Giorgio Arena3737c792020-11-23 17:47:23 +0000921 const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100922 const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
Giorgio Arena3737c792020-11-23 17:47:23 +0000923
924 if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
925 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100926 depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arena3737c792020-11-23 17:47:23 +0000927 }
928 else
929 {
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100930 depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arena3737c792020-11-23 17:47:23 +0000931 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100932 }
933}
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100934
Manuel Bottinib4bb6a02021-05-24 16:01:32 +0100935void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Michalis Spyrou60c3b0e2021-04-08 12:02:58 +0100936{
937 ARM_COMPUTE_UNUSED(info);
938 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
939 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
940 ARM_COMPUTE_ERROR_ON(_func == nullptr);
941
942 const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
943 const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
944 const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
945 auto dst = tensors.get_tensor(TensorType::ACL_DST);
946 (this->*_func)(src, weights, biases, dst, window, _has_biases);
947}
948} // namespace kernels
949} // namespace cpu
Giorgio Arena44f55722019-07-12 14:49:49 +0100950} // namespace arm_compute