blob: 7626fda886430a0c3caa56e782ef308ef92586b9 [file] [log] [blame]
Giorgio Arena44f55722019-07-12 14:49:49 +01001/*
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +00002 * Copyright (c) 2019-2020 ARM Limited.
Giorgio Arena44f55722019-07-12 14:49:49 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +010024#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010025
26#include "arm_compute/core/AccessWindowStatic.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010027#include "arm_compute/core/CPP/Validate.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010028#include "arm_compute/core/NEON/wrapper/traits.h"
29#include "arm_compute/core/NEON/wrapper/wrapper.h"
30#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010031#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
32#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +010033
Giorgio Arena44f55722019-07-12 14:49:49 +010034namespace arm_compute
35{
36namespace
37{
Giorgio Arenad93e2632019-10-15 11:09:33 +010038void pad_vectors(std::vector<int> &mult, std::vector<int> &shift, int vec_size)
39{
40 ARM_COMPUTE_ERROR_ON(mult.size() != shift.size());
41 while(mult.size() % vec_size != 0)
42 {
43 mult.push_back(0);
44 shift.push_back(0);
45 }
46}
47
/** Depthwise convolution core loop for float types when depth_multiplier == 1 (NHWC).
 *
 * Processes S channels per window step with NEON vectors of type T. The window's X
 * dimension walks the channels; Y/Z provide the output spatial coordinates used to
 * derive the input byte offsets directly (the input/weights iterators are not
 * auto-advanced along Y/Z — see the collapsed window dimensions below).
 *
 * @tparam T          Element data type (float, or float16_t when FP16 is available).
 * @tparam S          Number of elements per NEON vector.
 * @tparam has_biases True if a per-channel bias vector is added to the accumulator.
 */
template <typename T, int S, bool has_biases>
void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                   const Size2D &dilation, const Window &window)
{
    using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, S>::tag_type;

    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    // Upper bound used to clamp input load offsets (via std::min below) so the vector
    // loads never dereference past the end of the (padded) input buffer.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // Collapse Y/Z so the input iterator only advances along the channel dimension;
    // spatial movement is done manually through explicit byte offsets.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // The weights iterator additionally ignores dimension 3.
    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});

        // Top-left corner of this output element's receptive field in the input,
        // in elements (may be negative because of the left/top padding).
        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        // Walk the kernel taps row by row, accumulating with fused multiply-accumulate.
        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                const auto input_vals   = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
                const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));

                acc = wrapper::vmla(acc, weights_vals, input_vals);
                // Horizontal step scaled by the dilation factor.
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            // Vertical step scaled by the dilation factor.
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
            acc                    = wrapper::vadd(acc, biases_vals);
        }

        wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
    },
    input_it, weights_it, biases_it, output_it);
}
120
/** Depthwise convolution core loop for float types with a generic depth multiplier (NHWC).
 *
 * Scalar variant: each window step reads one input channel and produces
 * depth_multiplier consecutive output channels, accumulated in a small vector.
 *
 * @tparam T          Element data type (float, or float16_t when FP16 is available).
 * @tparam has_biases True if per-channel biases are added after accumulation.
 */
template <typename T, bool has_biases>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window)
{
    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    // Upper bound used to clamp input load offsets (via std::min below) so reads
    // never dereference past the end of the (padded) input buffer.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // Collapse Y/Z so the input iterator only advances along the channel dimension;
    // spatial movement is done manually through explicit byte offsets.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    // The execution window steps by depth_multiplier along X (outputs per step),
    // but the input only advances one channel per step.
    win_input.set_dimension_step(Window::DimX, 1);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        // One accumulator per generated output channel.
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        // Top-left corner of the receptive field (may be negative due to padding).
        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));

                // The depth_multiplier weights for this input channel are contiguous.
                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
                    // Fused multiply-add through the support wrapper.
                    acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                // Horizontal step scaled by the dilation factor.
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            // Vertical step scaled by the dilation factor.
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
204
/** Quantized depthwise convolution core loop when depth_multiplier == 1 (NHWC).
 *
 * Accumulates on raw quantized values and applies the zero-point corrections
 * afterwards, using the identity
 *   sum((i - i_off) * (w - w_off)) = sum(i*w) - w_off*sum(i) - i_off*sum(w) + N*i_off*w_off
 * where N is the number of kernel taps. The corrected accumulator is then
 * requantized with the per-channel fixed-point multiplier/shift and clamped
 * to T's representable range.
 *
 * @tparam T              Input/output element type (uint8_t or int8_t).
 * @tparam TW             Weights element type (uint8_t or int8_t).
 * @tparam S              Number of channels processed per window step.
 * @tparam has_biases     True if per-channel S32 biases are added before requantization.
 * @tparam is_per_channel True if the weights are quantized per channel.
 */
template <typename T, typename TW, int S, bool has_biases, bool is_per_channel>
void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
{
    using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, S>::tag_type;

    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    // Upper bound used to clamp input load offsets (via std::min below) so the vector
    // loads never dereference past the end of the (padded) input buffer.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    const int32_t input_qoffset   = input->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = output->info()->quantization_info().uniform().offset;
    // Constant correction term N * i_off * w_off (N = weights_width * weights_height).
    const int32_t k_offset        = weights_width * weights_height * input_qoffset * weights_qoffset;

    // Collapse Y/Z so the input iterator only advances along the channel dimension;
    // spatial movement is done manually through explicit byte offsets.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        // Per-lane raw accumulator plus running input/weights sums for the
        // zero-point correction applied after the kernel taps are consumed.
        std::vector<int32_t> acc(S, 0);
        std::vector<int32_t> in_sum(S, 0);
        std::vector<int32_t> we_sum(S, 0);

        // Top-left corner of the receptive field (may be negative due to padding).
        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                const auto input_vals   = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
                const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * weights_stride_y));

                for(int i = 0; i < S; ++i)
                {
                    acc.at(i) += input_vals[i] * weights_vals[i];
                    in_sum.at(i) += input_vals[i];
                    we_sum.at(i) += weights_vals[i];
                }

                // Horizontal step scaled by the dilation factor.
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            // Vertical step scaled by the dilation factor.
            input_offset += dilation.y() * input_stride_z;
        }

        VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
        for(int i = 0; i < S; ++i)
        {
            // Apply the zero-point correction terms (see function header).
            acc.at(i) -= in_sum.at(i) * weights_qoffset;
            acc.at(i) -= we_sum.at(i) * input_qoffset;
            acc.at(i) += k_offset;

            if(has_biases)
            {
                acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
            }

            // Requantize with the per-output-channel multiplier/shift
            // (id.x() + i is the output channel index). A negative shift is a
            // left shift applied before the saturating doubling high multiply.
            const int out_mul   = output_multiplier.at(id.x() + i);
            const int out_shift = output_shift.at(id.x() + i);
            if(out_shift < 0)
            {
                acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
            }
            out_vals[i] = static_cast<T>(utility::clamp<int32_t, T>(acc.at(i)));
        }

        wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), out_vals);
    },
    input_it, weights_it, biases_it, output_it);
}
309
310template <typename T, typename TW, bool has_biases, bool is_per_channel>
311void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
312 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
313{
314 const size_t input_stride_y = input->info()->strides_in_bytes().y();
315 const size_t input_stride_z = input->info()->strides_in_bytes().z();
316 const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
317 input->info()->strides_in_bytes().y();
318 const size_t weights_width = weights->info()->dimension(1);
319 const size_t weights_height = weights->info()->dimension(2);
320 const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
321 const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
322 const size_t conv_stride_x = conv_info.stride().first;
323 const size_t conv_stride_y = conv_info.stride().second;
324 const size_t conv_pad_left = conv_info.pad_left();
325 const size_t conv_pad_top = conv_info.pad_top();
326
327 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
328 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
329 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
330 const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
331
332 Window win_input = window;
333 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
334 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
335
336 Window win_weights = win_input;
337 win_weights.set(3, Window::Dimension(0, 0, 0));
338
339 win_input.set_dimension_step(Window::DimX, 1);
340
341 Iterator input_it(input, win_input);
342 Iterator weights_it(weights, win_weights);
343 Iterator output_it(output, window);
344 Iterator biases_it{};
345
346 if(has_biases)
347 {
348 biases_it = Iterator(biases, win_weights);
349 }
350
351 execute_window_loop(window, [&](const Coordinates & id)
352 {
353 std::vector<int32_t> acc(depth_multiplier, 0);
354 std::vector<int32_t> we_sum(depth_multiplier, 0);
355 int32_t in_sum = 0;
356
357 const int input_y = id.y() * conv_stride_x - conv_pad_left;
358 const int input_z = id.z() * conv_stride_y - conv_pad_top;
359 int input_offset = input_y * input_stride_y + input_z * input_stride_z;
360
361 auto weights_ptr = weights_it.ptr();
362 for(size_t h = 0; h < weights_height; ++h)
363 {
364 int offs = input_offset;
365 for(size_t w = 0; w < weights_width; ++w)
366 {
367 const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
368
369 for(size_t m = 0; m < depth_multiplier; ++m)
370 {
371 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
372 acc.at(m) += input_val * weights_val;
373
374 we_sum.at(m) += weights_val;
375 }
376
377 offs += dilation.x() * input_stride_y;
378 in_sum += input_val;
379 }
380
381 weights_ptr += weights_stride_z;
382 input_offset += dilation.y() * input_stride_z;
383 }
384
385 for(size_t m = 0; m < depth_multiplier; ++m)
386 {
387 acc.at(m) -= in_sum * weights_qoffset;
388 acc.at(m) -= we_sum.at(m) * input_qoffset;
389 acc.at(m) += k_offset;
390
391 if(has_biases)
392 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000393 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
394 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100395
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000396 const int out_mul = output_multiplier.at(id.x() + m);
397 const int out_shift = output_shift.at(id.x() + m);
398 if(out_shift < 0)
399 {
400 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100401 }
402 else
403 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000404 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100405 }
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000406 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, T>(acc.at(m)));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100407 }
408 },
409 input_it, weights_it, biases_it, output_it);
410}
411
/** Validate tensor metadata and convolution parameters for this kernel.
 *
 * Checks data types, layout, the dilated kernel fitting inside the padded input,
 * channel/depth-multiplier consistency, bias compatibility and (if already
 * initialized) the output shape.
 *
 * @return A Status describing the first failed check, or an empty Status on success.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
    // The dilated kernel extent, w + (w - 1) * (d - 1), must fit inside the padded input plane.
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
    // Dimension 0 holds the channels: weights must provide input_channels * depth_multiplier outputs.
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    if(is_data_type_quantized_per_channel(weights->data_type()))
    {
        // Per-channel weights need one quantization scale per output channel.
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
    }

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

        // Quantized kernels accumulate in 32-bit, so biases must be S32;
        // float kernels expect biases of the same type as the weights.
        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
        }
    }

    if(output->total_size() != 0)
    {
        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    }

    return Status{};
}
Giorgio Arena44f55722019-07-12 14:49:49 +0100460
/** Initialize the output tensor if needed and compute the kernel execution window.
 *
 * @return A pair of (Status, Window): the Status is an error if the required
 *         padding could not be established, the Window is the maximum window
 *         over the (possibly auto-initialized) output.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
                                                        ITensorInfo *output, const PadStrideInfo &conv_info,
                                                        unsigned int depth_multiplier, const Size2D &dilation)
{
    // Get convolved dimensions
    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);

    // Output auto initialization if not yet initialized (preserving any
    // quantization info already set on the output).
    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));

    // With depth_multiplier == 1 the kernels process 8 bytes worth of channels per
    // iteration; otherwise one input channel is read per iteration.
    const unsigned int num_elems_read_per_iteration    = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
    const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;

    // Configure kernel window
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));

    // NOTE(review): the argument order ceil_to_multiple(num_elems_read_per_iteration,
    // input->dimension(0)) looks swapped — one would expect the channel count rounded
    // up to the vector width, not vice versa. Confirm against ceil_to_multiple's
    // signature before changing.
    AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
                                    input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
    AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    if(biases != nullptr)
    {
        AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
        window_changed |= update_window_and_padding(win, biases_access);
    }

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    // If any tensor's padding had to be changed the caller must treat this as an error.
    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
Giorgio Arenad93e2632019-10-15 11:09:33 +0100496} // namespace
Giorgio Arena44f55722019-07-12 14:49:49 +0100497
/** Default constructor: all members start empty/neutral (depth multiplier 1, zero
 *  border); configure() must be called before the kernel can run. */
NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
    : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift()
{
}
502
/** Border (input padding) required by this kernel, as computed in configure(). */
BorderSize NEDepthwiseConvolutionLayerNativeKernel::border_size() const
{
    return _border_size;
}
507
/** Configure the kernel: validate the arguments, cache the tensors/parameters,
 *  precompute the per-channel requantization multipliers/shifts for quantized
 *  types, select the data-type-specific run function and set the execution window.
 *
 * @param[in]  input            Source tensor (QASYMM8/QASYMM8_SIGNED/F16/F32).
 * @param[in]  weights          Weights tensor.
 * @param[in]  biases           Optional biases tensor (may be nullptr).
 * @param[out] output           Destination tensor.
 * @param[in]  conv_info        Padding and stride information.
 * @param[in]  depth_multiplier Number of outputs generated per input channel.
 * @param[in]  dilation         Kernel dilation factors.
 */
void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));

    _input            = input;
    _weights          = weights;
    _biases           = biases;
    _output           = output;
    _conv_info        = conv_info;
    _depth_multiplier = depth_multiplier;
    // Border: left padding on the left edge, the largest remaining pad on the right.
    _border_size      = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
    _dilation         = dilation;

    if(is_data_type_quantized(_input->info()->data_type()))
    {
        const auto input_scale  = input->info()->quantization_info().uniform().scale;
        const auto output_scale = output->info()->quantization_info().uniform().scale;

        // For uniformly quantized weights replicate the single scale across all
        // channels so one per-channel code path serves both cases.
        auto weights_scale = weights->info()->quantization_info().scale();
        if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
        {
            for(size_t i = 1; i < _weights->info()->dimension(0); ++i)
            {
                weights_scale.push_back(weights_scale.front());
            }
        }

        // Convert each effective scale (in * w / out) into a fixed-point
        // multiplier + shift pair used at requantization time.
        for(size_t i = 0; i < weights_scale.size(); ++i)
        {
            int32_t out_mult  = 0;
            int32_t out_shift = 0;
            const float multiplier = input_scale * weights_scale.at(i) / output_scale;
            arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

            _output_multiplier.push_back(out_mult);
            _output_shift.push_back(out_shift);
        }
    }

    // Select the templated run function for the weights data type; the
    // multiplier/shift vectors are padded to the kernel's vector width so
    // whole-vector reads stay in bounds.
    switch(_weights->info()->data_type())
    {
        case DataType::QASYMM8:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
        case DataType::QSYMM8_PER_CHANNEL:
            // Per-channel weights can pair with either an unsigned or signed input.
            if(_input->info()->data_type() == DataType::QASYMM8)
            {
                _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, true, true> :
                        &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, false, true>;
            }
            else
            {
                _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, true, true> :
                        &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, false, true>;
            }
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 4);
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 2);
            break;
        default:
            ARM_COMPUTE_ERROR("Data type not supported");
            break;
    }

    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}
595
/** Static validation: run the same argument and window checks as configure()
 *  without modifying any tensor (clones are used for the window computation). */
Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                         unsigned int depth_multiplier,
                                                         const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
                                                              depth_multiplier, dilation)
                                .first);
    return Status{};
}
606
/** Execute the kernel on the given window slice (one call per thread). */
void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    // Dispatch to the data-type-specific implementation selected in configure().
    (this->*_func)(window);
}
615
/** Float overload of run_depthwise (enabled via SFINAE for float / float16_t):
 *  dispatches to the vectorized loop when depth_multiplier == 1, otherwise to
 *  the scalar generic loop. */
template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if < std::is_same<T, float>::value
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                                                                                           || std::is_same<T, float16_t>::value
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                                                                                           ,
                                                                                                           int >::type >
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    if(_depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_fp<T, S, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, window);
    }
    else
    {
        depthwise_loop_generic_fp<T, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window);
    }
}
636
/** Quantized overload of run_depthwise (selected when T is not a float type):
 *  dispatches to the vectorized loop when depth_multiplier == 1, otherwise to
 *  the scalar generic loop, forwarding the precomputed requantization vectors. */
template <typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    if(_depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_quantized<T, TW, S, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window);
    }
    else
    {
        depthwise_loop_generic_quantized<T, TW, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window);
    }
}
652} // namespace arm_compute