blob: a0d45afd2a630ef83e0dce54b1ef7649455ee646 [file] [log] [blame]
/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +010024#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010025
26#include "arm_compute/core/AccessWindowStatic.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010027#include "arm_compute/core/CPP/Validate.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010028#include "arm_compute/core/NEON/wrapper/traits.h"
29#include "arm_compute/core/NEON/wrapper/wrapper.h"
30#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010031#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
32#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +010033
Giorgio Arena44f55722019-07-12 14:49:49 +010034namespace arm_compute
35{
36namespace
37{
Giorgio Arenad93e2632019-10-15 11:09:33 +010038void pad_vectors(std::vector<int> &mult, std::vector<int> &shift, int vec_size)
39{
40 ARM_COMPUTE_ERROR_ON(mult.size() != shift.size());
41 while(mult.size() % vec_size != 0)
42 {
43 mult.push_back(0);
44 shift.push_back(0);
45 }
46}
47
Giorgio Arena44f55722019-07-12 14:49:49 +010048template <typename T, int S, bool has_biases>
Giorgio Arenad93e2632019-10-15 11:09:33 +010049void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
50 const Size2D &dilation, const Window &window)
Giorgio Arena44f55722019-07-12 14:49:49 +010051{
52 using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
53 using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
54
55 const size_t input_stride_y = input->info()->strides_in_bytes().y();
56 const size_t input_stride_z = input->info()->strides_in_bytes().z();
57 const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
58 input->info()->strides_in_bytes().y();
59 const size_t weights_width = weights->info()->dimension(1);
60 const size_t weights_height = weights->info()->dimension(2);
61 const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
62 const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
63 const size_t conv_stride_x = conv_info.stride().first;
64 const size_t conv_stride_y = conv_info.stride().second;
65 const size_t conv_pad_left = conv_info.pad_left();
66 const size_t conv_pad_top = conv_info.pad_top();
67
68 Window win_input = window;
69 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
70 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
71
72 Window win_weights = win_input;
73 win_weights.set(3, Window::Dimension(0, 0, 0));
74
75 Iterator input_it(input, win_input);
76 Iterator weights_it(weights, win_weights);
77 Iterator output_it(output, window);
78 Iterator biases_it{};
79
80 if(has_biases)
81 {
82 biases_it = Iterator(biases, win_weights);
83 }
84
85 execute_window_loop(window, [&](const Coordinates & id)
86 {
87 VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});
88
89 const int input_y = id.y() * conv_stride_x - conv_pad_left;
90 const int input_z = id.z() * conv_stride_y - conv_pad_top;
91 int input_offset = input_y * input_stride_y + input_z * input_stride_z;
92
93 auto weights_ptr = weights_it.ptr();
94 for(size_t h = 0; h < weights_height; ++h)
95 {
96 int offs = input_offset;
97 for(size_t w = 0; w < weights_width; ++w)
98 {
99 const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
100 const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));
101
102 acc = wrapper::vmla(acc, weights_vals, input_vals);
103 offs += dilation.x() * input_stride_y;
104 }
105
106 weights_ptr += weights_stride_z;
107 input_offset += dilation.y() * input_stride_z;
108 }
109
110 if(has_biases)
111 {
112 const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
113 acc = wrapper::vadd(acc, biases_vals);
114 }
115
116 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
117 },
118 input_it, weights_it, biases_it, output_it);
119}
120
121template <typename T, bool has_biases>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100122void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
123 const Size2D &dilation, unsigned int depth_multiplier, const Window &window)
Giorgio Arena44f55722019-07-12 14:49:49 +0100124{
125 const size_t input_stride_y = input->info()->strides_in_bytes().y();
126 const size_t input_stride_z = input->info()->strides_in_bytes().z();
127 const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
128 input->info()->strides_in_bytes().y();
129 const size_t weights_width = weights->info()->dimension(1);
130 const size_t weights_height = weights->info()->dimension(2);
131 const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
132 const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
133 const size_t conv_stride_x = conv_info.stride().first;
134 const size_t conv_stride_y = conv_info.stride().second;
135 const size_t conv_pad_left = conv_info.pad_left();
136 const size_t conv_pad_top = conv_info.pad_top();
137
138 Window win_input = window;
139 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
140 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
141
142 Window win_weights = win_input;
143 win_weights.set(3, Window::Dimension(0, 0, 0));
144
145 win_input.set_dimension_step(Window::DimX, 1);
146
147 Iterator input_it(input, win_input);
148 Iterator weights_it(weights, win_weights);
149 Iterator output_it(output, window);
150 Iterator biases_it{};
151
152 if(has_biases)
153 {
154 biases_it = Iterator(biases, win_weights);
155 }
156
157 execute_window_loop(window, [&](const Coordinates & id)
158 {
159 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
160
161 const int input_y = id.y() * conv_stride_x - conv_pad_left;
162 const int input_z = id.z() * conv_stride_y - conv_pad_top;
163 int input_offset = input_y * input_stride_y + input_z * input_stride_z;
164
165 auto weights_ptr = weights_it.ptr();
166 for(size_t h = 0; h < weights_height; ++h)
167 {
168 int offs = input_offset;
169 for(size_t w = 0; w < weights_width; ++w)
170 {
171 const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
172
173 for(size_t m = 0; m < depth_multiplier; ++m)
174 {
175 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +0100176 acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
Giorgio Arena44f55722019-07-12 14:49:49 +0100177 }
178
179 offs += dilation.x() * input_stride_y;
180 }
181
182 weights_ptr += weights_stride_z;
183 input_offset += dilation.y() * input_stride_z;
184 }
185
186 if(has_biases)
187 {
188 for(size_t m = 0; m < depth_multiplier; ++m)
189 {
190 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
191 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
192 }
193 }
194 else
195 {
196 for(size_t m = 0; m < depth_multiplier; ++m)
197 {
198 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
199 }
200 }
201 },
202 input_it, weights_it, biases_it, output_it);
203}
204
Giorgio Arenad93e2632019-10-15 11:09:33 +0100205template <typename T, typename TW, int S, bool has_biases, bool is_per_channel>
206void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
207 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
208{
209 using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
210 using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
211
212 const size_t input_stride_y = input->info()->strides_in_bytes().y();
213 const size_t input_stride_z = input->info()->strides_in_bytes().z();
214 const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
215 input->info()->strides_in_bytes().y();
216 const size_t weights_width = weights->info()->dimension(1);
217 const size_t weights_height = weights->info()->dimension(2);
218 const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
219 const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
220 const size_t conv_stride_x = conv_info.stride().first;
221 const size_t conv_stride_y = conv_info.stride().second;
222 const size_t conv_pad_left = conv_info.pad_left();
223 const size_t conv_pad_top = conv_info.pad_top();
224
225 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
226 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
227 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
228 const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
229
230 Window win_input = window;
231 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
232 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
233
234 Window win_weights = win_input;
235 win_weights.set(3, Window::Dimension(0, 0, 0));
236
237 Iterator input_it(input, win_input);
238 Iterator weights_it(weights, win_weights);
239 Iterator output_it(output, window);
240 Iterator biases_it{};
241
242 if(has_biases)
243 {
244 biases_it = Iterator(biases, win_weights);
245 }
246
247 execute_window_loop(window, [&](const Coordinates & id)
248 {
249 std::vector<int32_t> acc(S, 0);
250 std::vector<int32_t> in_sum(S, 0);
251 std::vector<int32_t> we_sum(S, 0);
252
253 const int input_y = id.y() * conv_stride_x - conv_pad_left;
254 const int input_z = id.z() * conv_stride_y - conv_pad_top;
255 int input_offset = input_y * input_stride_y + input_z * input_stride_z;
256
257 auto weights_ptr = weights_it.ptr();
258 for(size_t h = 0; h < weights_height; ++h)
259 {
260 int offs = input_offset;
261 for(size_t w = 0; w < weights_width; ++w)
262 {
263 const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
264 const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * weights_stride_y));
265
266 for(int i = 0; i < S; ++i)
267 {
268 acc.at(i) += input_vals[i] * weights_vals[i];
269 in_sum.at(i) += input_vals[i];
270 we_sum.at(i) += weights_vals[i];
271 }
272
273 offs += dilation.x() * input_stride_y;
274 }
275
276 weights_ptr += weights_stride_z;
277 input_offset += dilation.y() * input_stride_z;
278 }
279
280 VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
281 for(int i = 0; i < S; ++i)
282 {
283 acc.at(i) -= in_sum.at(i) * weights_qoffset;
284 acc.at(i) -= we_sum.at(i) * input_qoffset;
285 acc.at(i) += k_offset;
286
287 if(has_biases)
288 {
289 acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
290 }
291
292 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), output_multiplier.at(id.x() + i)), output_shift.at(id.x() + i)) + output_qoffset;
293 out_vals[i] = static_cast<T>(utility::clamp<int32_t, uint8_t>(acc.at(i)));
294 }
295
296 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), out_vals);
297 },
298 input_it, weights_it, biases_it, output_it);
299}
300
301template <typename T, typename TW, bool has_biases, bool is_per_channel>
302void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
303 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
304{
305 const size_t input_stride_y = input->info()->strides_in_bytes().y();
306 const size_t input_stride_z = input->info()->strides_in_bytes().z();
307 const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
308 input->info()->strides_in_bytes().y();
309 const size_t weights_width = weights->info()->dimension(1);
310 const size_t weights_height = weights->info()->dimension(2);
311 const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
312 const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
313 const size_t conv_stride_x = conv_info.stride().first;
314 const size_t conv_stride_y = conv_info.stride().second;
315 const size_t conv_pad_left = conv_info.pad_left();
316 const size_t conv_pad_top = conv_info.pad_top();
317
318 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
319 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
320 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
321 const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
322
323 Window win_input = window;
324 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
325 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
326
327 Window win_weights = win_input;
328 win_weights.set(3, Window::Dimension(0, 0, 0));
329
330 win_input.set_dimension_step(Window::DimX, 1);
331
332 Iterator input_it(input, win_input);
333 Iterator weights_it(weights, win_weights);
334 Iterator output_it(output, window);
335 Iterator biases_it{};
336
337 if(has_biases)
338 {
339 biases_it = Iterator(biases, win_weights);
340 }
341
342 execute_window_loop(window, [&](const Coordinates & id)
343 {
344 std::vector<int32_t> acc(depth_multiplier, 0);
345 std::vector<int32_t> we_sum(depth_multiplier, 0);
346 int32_t in_sum = 0;
347
348 const int input_y = id.y() * conv_stride_x - conv_pad_left;
349 const int input_z = id.z() * conv_stride_y - conv_pad_top;
350 int input_offset = input_y * input_stride_y + input_z * input_stride_z;
351
352 auto weights_ptr = weights_it.ptr();
353 for(size_t h = 0; h < weights_height; ++h)
354 {
355 int offs = input_offset;
356 for(size_t w = 0; w < weights_width; ++w)
357 {
358 const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
359
360 for(size_t m = 0; m < depth_multiplier; ++m)
361 {
362 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
363 acc.at(m) += input_val * weights_val;
364
365 we_sum.at(m) += weights_val;
366 }
367
368 offs += dilation.x() * input_stride_y;
369 in_sum += input_val;
370 }
371
372 weights_ptr += weights_stride_z;
373 input_offset += dilation.y() * input_stride_z;
374 }
375
376 for(size_t m = 0; m < depth_multiplier; ++m)
377 {
378 acc.at(m) -= in_sum * weights_qoffset;
379 acc.at(m) -= we_sum.at(m) * input_qoffset;
380 acc.at(m) += k_offset;
381
382 if(has_biases)
383 {
384 const auto biases_val = *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
385
386 int32_t out_val = acc.at(m) + biases_val;
387 out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(out_val, output_multiplier.at(id.x() + m)),
388 output_shift.at(id.x() + m))
389 + output_qoffset;
390 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, uint8_t>(out_val));
391 }
392 else
393 {
394 int32_t out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), output_multiplier.at(id.x() + m)),
395 output_shift.at(id.x() + m))
396 + output_qoffset;
397 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, uint8_t>(out_val));
398 }
399 }
400 },
401 input_it, weights_it, biases_it, output_it);
402}
403
Giorgio Arena44f55722019-07-12 14:49:49 +0100404Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
405 const Size2D &dilation)
406{
407 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100408 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
409 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
410 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
Giorgio Arena44f55722019-07-12 14:49:49 +0100411 ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100412 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
413 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
Giorgio Arena44f55722019-07-12 14:49:49 +0100414 ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
415 ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
416 ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));
417
Giorgio Arenad93e2632019-10-15 11:09:33 +0100418 if(is_data_type_quantized_per_channel(weights->data_type()))
419 {
420 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
421 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
422 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
423 }
424 else
425 {
426 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
427 }
428
Giorgio Arena44f55722019-07-12 14:49:49 +0100429 if(biases != nullptr)
430 {
431 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
432 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100433
434 if(is_data_type_quantized_asymmetric(input->data_type()))
435 {
436 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
437 }
438 else
439 {
440 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
441 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100442 }
443
444 if(output->total_size() != 0)
445 {
446 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
447 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
448 }
449
450 return Status{};
451}
Giorgio Arena44f55722019-07-12 14:49:49 +0100452
453std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
454 ITensorInfo *output, const PadStrideInfo &conv_info,
455 unsigned int depth_multiplier, const Size2D &dilation)
456{
457 // Get convolved dimensions
458 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
459
460 // Output auto inizialitation if not yet initialized
Giorgio Arenad93e2632019-10-15 11:09:33 +0100461 auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
Giorgio Arena44f55722019-07-12 14:49:49 +0100462
463 // Configure kernel window (generic)
464 const unsigned int num_elems_read_per_iteration = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
465 const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;
466
467 // Configure kernel window
468 Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
469
470 AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
471 input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
472 AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
473 AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
474
475 bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
476
477 if(biases != nullptr)
478 {
479 AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
480 window_changed |= update_window_and_padding(win, biases_access);
481 }
482
483 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
484
485 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
486 return std::make_pair(err, win);
487}
Giorgio Arenad93e2632019-10-15 11:09:33 +0100488} // namespace
Giorgio Arena44f55722019-07-12 14:49:49 +0100489
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100490NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
Giorgio Arenad93e2632019-10-15 11:09:33 +0100491 : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift()
Giorgio Arena44f55722019-07-12 14:49:49 +0100492{
493}
494
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100495BorderSize NEDepthwiseConvolutionLayerNativeKernel::border_size() const
Giorgio Arena44f55722019-07-12 14:49:49 +0100496{
497 return _border_size;
498}
499
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100500void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
501 const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100502{
503 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
504 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));
505
506 _input = input;
507 _weights = weights;
508 _biases = biases;
509 _output = output;
510 _conv_info = conv_info;
511 _depth_multiplier = depth_multiplier;
512 _border_size = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
513 _dilation = dilation;
514
Giorgio Arenad93e2632019-10-15 11:09:33 +0100515 if(is_data_type_quantized(_input->info()->data_type()))
Giorgio Arena44f55722019-07-12 14:49:49 +0100516 {
Giorgio Arenad93e2632019-10-15 11:09:33 +0100517 const auto input_scale = input->info()->quantization_info().uniform().scale;
518 const auto output_scale = output->info()->quantization_info().uniform().scale;
519
520 auto weights_scale = weights->info()->quantization_info().scale();
521 if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
522 {
523 for(size_t i = 1; i < _weights->info()->dimension(0); ++i)
524 {
525 weights_scale.push_back(weights_scale.front());
526 }
527 }
528
529 for(size_t i = 0; i < weights_scale.size(); ++i)
530 {
531 int out_mult = 0;
532 int out_shift = 0;
533 const float multiplier = input_scale * weights_scale.at(i) / output_scale;
534 ARM_COMPUTE_ERROR_ON(multiplier > 1.f);
535 arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &out_mult, &out_shift);
536
537 _output_multiplier.push_back(out_mult);
538 _output_shift.push_back(out_shift);
539 }
540 }
541
542 switch(_weights->info()->data_type())
543 {
544 case DataType::QASYMM8:
545 _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, true, false> :
546 &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, false, false>;
547 pad_vectors(_output_multiplier, _output_shift, 8);
548 break;
549 case DataType::QSYMM8_PER_CHANNEL:
550 _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, true, true> :
551 &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, false, true>;
552 pad_vectors(_output_multiplier, _output_shift, 8);
553 break;
554#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
555 case DataType::F16:
556 _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, true, false> :
557 &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, false, false>;
558 pad_vectors(_output_multiplier, _output_shift, 4);
559 break;
560#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arena44f55722019-07-12 14:49:49 +0100561 case DataType::F32:
Giorgio Arenad93e2632019-10-15 11:09:33 +0100562 _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, true, false> :
563 &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, false, false>;
564 pad_vectors(_output_multiplier, _output_shift, 2);
Giorgio Arena44f55722019-07-12 14:49:49 +0100565 break;
566 default:
567 ARM_COMPUTE_ERROR("Data type not supported");
568 break;
569 }
570
571 auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
572 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
573 INEKernel::configure(win_config.second);
574}
575
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100576Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
577 unsigned int depth_multiplier,
578 const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100579{
580 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
581 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
582 depth_multiplier, dilation)
583 .first);
584 return Status{};
585}
586
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100587void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100588{
589 ARM_COMPUTE_UNUSED(info);
590 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
591 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
592
593 (this->*_func)(window);
594}
595
Giorgio Arenad93e2632019-10-15 11:09:33 +0100596template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if < std::is_same<T, float>::value
597#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
598 || std::is_same<T, float16_t>::value
599#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
600 ,
601 int >::type >
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100602void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
Giorgio Arena44f55722019-07-12 14:49:49 +0100603{
604 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
605 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
606
607 if(_depth_multiplier == 1)
608 {
Giorgio Arenad93e2632019-10-15 11:09:33 +0100609 depthwise_loop_multiplier1_fp<T, S, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, window);
Giorgio Arena44f55722019-07-12 14:49:49 +0100610 }
611 else
612 {
Giorgio Arenad93e2632019-10-15 11:09:33 +0100613 depthwise_loop_generic_fp<T, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window);
614 }
615}
616
617template <typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if<std::is_same<T, uint8_t>::value, int>::type>
618void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
619{
620 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
621 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
622
623 if(_depth_multiplier == 1)
624 {
625 depthwise_loop_multiplier1_quantized<T, TW, S, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window);
626 }
627 else
628 {
629 depthwise_loop_generic_quantized<T, TW, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window);
Giorgio Arena44f55722019-07-12 14:49:49 +0100630 }
631}
632} // namespace arm_compute