blob: 533b3745942f9ebd09f4f102a3448cd2396be05e [file] [log] [blame]
/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +010024#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010025
Giorgio Arena44f55722019-07-12 14:49:49 +010026#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010027#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/AccessWindowStatic.h"
29#include "src/core/CPP/Validate.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010030#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010031#include "src/core/NEON/wrapper/traits.h"
32#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010033#include "src/core/helpers/AutoConfiguration.h"
34#include "src/core/helpers/WindowHelpers.h"
Matthew Bentham758b5ba2020-03-05 23:37:48 +000035#include "support/ToolchainSupport.h"
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +010036
Giorgio Arena44f55722019-07-12 14:49:49 +010037namespace arm_compute
38{
39namespace
40{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010041constexpr auto data_layout = DataLayout::NHWC;
42const size_t batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
43const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
44const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
45const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
46
47constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
48constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
49constexpr size_t vector_size = 8;
50
51struct DepthwiseConvolutionRunInfo
Giorgio Arenad93e2632019-10-15 11:09:33 +010052{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010053 const size_t num_read_elements_per_iteration;
54 const uint32_t x_start;
55 const uint32_t x_end;
56 const uint32_t x_step;
57 const uint32_t x_leftover_start;
58 const size_t input_stride_y;
59 const size_t input_stride_z;
60 const size_t input_max_offset;
61 const size_t weights_width;
62 const size_t weights_height;
63 const size_t weights_stride_y;
64 const size_t weights_stride_z;
65 const size_t conv_stride_x;
66 const size_t conv_stride_y;
67 const size_t conv_pad_left;
68 const size_t conv_pad_top;
69 const size_t input_height;
70 const size_t input_width;
71 const size_t input_depth;
72
73 DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
74 : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
75 x_start(w.x().start()),
76 x_end(w.x().end()),
77 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
78 x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
79 input_stride_y(input.strides_in_bytes().y()),
80 input_stride_z(input.strides_in_bytes().z()),
81 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
82 weights_width(weights.dimension(width_idx)),
83 weights_height(weights.dimension(height_idx)),
84 weights_stride_y(weights.strides_in_bytes().y()),
85 weights_stride_z(weights.strides_in_bytes().z()),
86 conv_stride_x(conv_info.stride().first),
87 conv_stride_y(conv_info.stride().second),
88 conv_pad_left(conv_info.pad_left()),
89 conv_pad_top(conv_info.pad_top()),
90 input_height(input.dimension(height_idx)),
91 input_width(input.dimension(width_idx)),
92 input_depth(input.dimension(channel_idx))
Giorgio Arenad93e2632019-10-15 11:09:33 +010093 {
Giorgio Arenad93e2632019-10-15 11:09:33 +010094 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010095};
96
97inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
98{
99 const int32_t current_h = base_h + h * dilation.y();
100 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
101
102 const int32_t current_w = base_w + w * dilation.x();
103 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
104
105 return is_valid_h && is_valid_w;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100106}
107
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100108template <typename T>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100109void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100110 const Size2D &dilation, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100111{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100112 constexpr auto element_per_vector = vector_size / sizeof(T);
113 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
114 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
Giorgio Arena44f55722019-07-12 14:49:49 +0100115
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100116 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
117
118 const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
119
120 Window execution_window = window;
121 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100122
123 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100124 win_input.set(Window::DimX, dim_manual_loop);
125 win_input.set(Window::DimY, dim_manual_loop);
126 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100127
128 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100129 win_weights.set(Window::DimW, dim_manual_loop);
130
131 Window win_output = window;
132 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100133
134 Iterator input_it(input, win_input);
135 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100136 Iterator output_it(output, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100137 Iterator biases_it{};
138
139 if(has_biases)
140 {
141 biases_it = Iterator(biases, win_weights);
142 }
143
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100144 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100145 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100146 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
147 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
148 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100149
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100150 auto const base_weights_ptr = weights_it.ptr();
151 uint32_t x = run_info.x_start;
Giorgio Arena44f55722019-07-12 14:49:49 +0100152
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100153 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arena44f55722019-07-12 14:49:49 +0100154 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100155 VectorType acc = zero_vector;
156 auto weights_ptr = base_weights_ptr;
157 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100158
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100159 for(uint32_t h = 0; h < run_info.weights_height; ++h)
160 {
161 int64_t offs = input_offset + x * sizeof(T);
162 for(uint32_t w = 0; w < run_info.weights_width; ++w)
163 {
164 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
165 const auto input_vals = is_valid_region ?
166 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
167 zero_vector;
168 const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
169 acc = wrapper::vmla(acc, weights_vals, input_vals);
170
171 offs += dilation.x() * run_info.input_stride_y;
172 }
173
174 weights_ptr += run_info.weights_stride_z;
175 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100176 }
177
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100178 if(has_biases)
179 {
180 const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
181 acc = wrapper::vadd(acc, biases_vals);
182 }
183
184 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
Giorgio Arena44f55722019-07-12 14:49:49 +0100185 }
186
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100187 for(; x < run_info.x_end; ++x)
Giorgio Arena44f55722019-07-12 14:49:49 +0100188 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100189 auto acc_scalar = T{ 0 };
190 auto weights_ptr = base_weights_ptr;
191 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100192
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100193 for(size_t h = 0; h < run_info.weights_height; ++h)
194 {
195 int64_t offs = input_offset + x * sizeof(T);
196 for(size_t w = 0; w < run_info.weights_width; ++w)
197 {
198 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
199 const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
200 const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
201
202 acc_scalar += (input_vals * weights_vals);
203
204 offs += dilation.x() * run_info.input_stride_y;
205 }
206
207 weights_ptr += run_info.weights_stride_z;
208 input_offset += dilation.y() * run_info.input_stride_z;
209 }
210
211 if(has_biases)
212 {
213 const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
214 acc_scalar += biases_vals;
215 }
216 *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
217 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100218 },
219 input_it, weights_it, biases_it, output_it);
220}
221
Michalis Spyrouf401c742020-05-12 16:18:33 +0100222template <typename T>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100223void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100224 const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100225{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100226 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
Giorgio Arena44f55722019-07-12 14:49:49 +0100227
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100228 Window execution_window = window;
229 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arena44f55722019-07-12 14:49:49 +0100230
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100231 Window win_input = execution_window;
232 win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
233 win_input.set(Window::DimY, dim_manual_loop);
234 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100235
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100236 Window win_weights = window;
237 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
238 win_weights.set(Window::DimY, dim_manual_loop);
239 win_weights.set(Window::DimZ, dim_manual_loop);
240 win_weights.set(Window::DimW, dim_manual_loop);
241
242 Window win_output = window;
243 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100244
245 Iterator input_it(input, win_input);
246 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100247 Iterator output_it(output, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100248 Iterator biases_it{};
249
250 if(has_biases)
251 {
252 biases_it = Iterator(biases, win_weights);
253 }
254
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100255 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100256 {
257 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
258
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100259 const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
260 const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
261 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100262
263 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100264 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arena44f55722019-07-12 14:49:49 +0100265 {
266 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100267 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arena44f55722019-07-12 14:49:49 +0100268 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100269 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
270 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
Giorgio Arena44f55722019-07-12 14:49:49 +0100271
272 for(size_t m = 0; m < depth_multiplier; ++m)
273 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100274 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +0100275 acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
Giorgio Arena44f55722019-07-12 14:49:49 +0100276 }
277
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100278 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arena44f55722019-07-12 14:49:49 +0100279 }
280
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100281 weights_ptr += run_info.weights_stride_z;
282 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100283 }
284
285 if(has_biases)
286 {
287 for(size_t m = 0; m < depth_multiplier; ++m)
288 {
289 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
290 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
291 }
292 }
293 else
294 {
295 for(size_t m = 0; m < depth_multiplier; ++m)
296 {
297 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
298 }
299 }
300 },
301 input_it, weights_it, biases_it, output_it);
302}
303
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100304template <typename T, typename TW>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100305void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100306 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100307{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100308 constexpr auto element_per_vector = vector_size / sizeof(T);
309 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
310 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
311 using AccType = int32_t;
312 using AccArrayType = std::array<AccType, element_per_vector>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100313
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100314 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
315 const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
316
317 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100318
319 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
320 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
321 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100322 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
323
324 Window execution_window = window;
325 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100326
327 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100328 win_input.set(Window::DimX, dim_manual_loop);
329 win_input.set(Window::DimY, dim_manual_loop);
330 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100331
332 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100333 win_weights.set(Window::DimW, dim_manual_loop);
334
335 Window win_output = window;
336 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100337
338 Iterator input_it(input, win_input);
339 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100340 Iterator output_it(output, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100341 Iterator biases_it{};
342
343 if(has_biases)
344 {
345 biases_it = Iterator(biases, win_weights);
346 }
347
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100348 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100349 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100350 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
351 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
352 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
353 auto const base_weights_ptr = weights_it.ptr();
354 size_t x = run_info.x_start;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100355
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100356 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100357 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100358 AccArrayType acc{};
359 AccArrayType in_sum{};
360 AccArrayType we_sum{};
Giorgio Arenad93e2632019-10-15 11:09:33 +0100361
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100362 auto weights_ptr = base_weights_ptr;
363 auto input_offset = base_input_offset;
364
365 for(size_t h = 0; h < run_info.weights_height; ++h)
366 {
367 int64_t offs = input_offset + x * sizeof(T);
368 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100369 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100370 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
371 const auto input_vals = is_valid_region ?
372 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
373 out_of_bound_vector;
374 const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
375
376 for(size_t i = 0; i < run_info.x_step; ++i)
377 {
378 acc.at(i) += input_vals[i] * weights_vals[i];
379 in_sum.at(i) += input_vals[i];
380 we_sum.at(i) += weights_vals[i];
381 }
382
383 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100384 }
385
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100386 weights_ptr += run_info.weights_stride_z;
387 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100388 }
389
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100390 VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
391 for(size_t i = 0; i < run_info.x_step; ++i)
392 {
393 acc.at(i) -= in_sum.at(i) * weights_qoffset;
394 acc.at(i) -= we_sum.at(i) * input_qoffset;
395 acc.at(i) += k_offset;
396
397 if(has_biases)
398 {
399 acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
400 }
401
402 const int32_t out_mul = output_multiplier.at(x + i);
403 const int32_t out_shift = output_shift.at(x + i);
404 if(out_shift < 0)
405 {
406 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
407 }
408 else
409 {
410 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
411 }
412 out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
413 }
414
415 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100416 }
417
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100418 // left-over
419 for(; x < run_info.x_end; ++x)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100420 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100421 AccType acc = 0;
422 AccType in_sum = 0;
423 AccType we_sum = 0;
424
425 auto weights_ptr = base_weights_ptr;
426 auto input_offset = base_input_offset;
427
428 for(size_t h = 0; h < run_info.weights_height; ++h)
429 {
430 int64_t offs = input_offset + x * sizeof(T);
431 for(size_t w = 0; w < run_info.weights_width; ++w)
432 {
433 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
434 const auto input_val = is_valid_region ?
435 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
436 out_of_bound_value;
437 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
438
439 acc += input_val * weights_val;
440 in_sum += input_val;
441 we_sum += weights_val;
442
443 offs += dilation.x() * run_info.input_stride_y;
444 }
445
446 weights_ptr += run_info.weights_stride_z;
447 input_offset += dilation.y() * run_info.input_stride_z;
448 }
449
450 T out_vals{ 0 };
451
452 acc -= in_sum * weights_qoffset;
453 acc -= we_sum * input_qoffset;
454 acc += k_offset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100455
456 if(has_biases)
457 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100458 acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100459 }
460
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100461 const int32_t out_mul = output_multiplier.at(x);
462 const int32_t out_shift = output_shift.at(x);
463
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000464 if(out_shift < 0)
465 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100466 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000467 }
468 else
469 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100470 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000471 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100472
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100473 out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
474 *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
475 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100476 },
477 input_it, weights_it, biases_it, output_it);
478}
479
Michalis Spyrouf401c742020-05-12 16:18:33 +0100480template <typename T, typename TW>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100481void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100482 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100483{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100484 using AccType = int32_t;
485
486 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
487
488 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
Giorgio Arenad93e2632019-10-15 11:09:33 +0100489
490 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
491 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
492 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100493 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100494
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100495 Window execution_window = window;
496 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100497
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100498 Window win_input = execution_window;
499 win_input.set(Window::DimY, dim_manual_loop);
500 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100501
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100502 Window win_weights = window;
503 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
504 win_weights.set(Window::DimY, dim_manual_loop);
505 win_weights.set(Window::DimZ, dim_manual_loop);
506 win_weights.set(Window::DimW, dim_manual_loop);
507
508 Window win_output = window;
509 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100510
511 Iterator input_it(input, win_input);
512 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100513 Iterator output_it(output, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100514 Iterator biases_it{};
515
516 if(has_biases)
517 {
518 biases_it = Iterator(biases, win_weights);
519 }
520
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100521 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100522 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100523 std::vector<AccType> acc(depth_multiplier, 0);
524 std::vector<AccType> we_sum(depth_multiplier, 0);
525 AccType in_sum = 0;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100526
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100527 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
528 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
529 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100530
531 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100532 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100533 {
534 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100535 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100536 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100537 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
538 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100539
540 for(size_t m = 0; m < depth_multiplier; ++m)
541 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100542 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100543 acc.at(m) += input_val * weights_val;
544
545 we_sum.at(m) += weights_val;
546 }
547
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100548 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100549 in_sum += input_val;
550 }
551
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100552 weights_ptr += run_info.weights_stride_z;
553 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100554 }
555
556 for(size_t m = 0; m < depth_multiplier; ++m)
557 {
558 acc.at(m) -= in_sum * weights_qoffset;
559 acc.at(m) -= we_sum.at(m) * input_qoffset;
560 acc.at(m) += k_offset;
561
562 if(has_biases)
563 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000564 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
565 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100566
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100567 const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
568 const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000569 if(out_shift < 0)
570 {
571 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100572 }
573 else
574 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000575 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100576 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100577 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100578 }
579 },
580 input_it, weights_it, biases_it, output_it);
581}
582
Giorgio Arena44f55722019-07-12 14:49:49 +0100583Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
584 const Size2D &dilation)
585{
586 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100587 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
588 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000589 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Giorgio Arena44f55722019-07-12 14:49:49 +0100590 ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100591 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
592 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
Giorgio Arena44f55722019-07-12 14:49:49 +0100593 ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
594 ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
595 ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));
596
Giorgio Arenad93e2632019-10-15 11:09:33 +0100597 if(is_data_type_quantized_per_channel(weights->data_type()))
598 {
599 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100600 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
601 }
602 else
603 {
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +0100604 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100605 }
606
Giorgio Arena44f55722019-07-12 14:49:49 +0100607 if(biases != nullptr)
608 {
609 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
610 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100611
612 if(is_data_type_quantized_asymmetric(input->data_type()))
613 {
614 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
615 }
616 else
617 {
618 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
619 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100620 }
621
622 if(output->total_size() != 0)
623 {
624 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
625 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +0100626 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100627 }
628
629 return Status{};
630}
Giorgio Arenad93e2632019-10-15 11:09:33 +0100631} // namespace
Giorgio Arena44f55722019-07-12 14:49:49 +0100632
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100633NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100634 : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
Giorgio Arena44f55722019-07-12 14:49:49 +0100635{
636}
637
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100638void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
639 const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100640{
641 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
642 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));
643
644 _input = input;
645 _weights = weights;
646 _biases = biases;
647 _output = output;
648 _conv_info = conv_info;
649 _depth_multiplier = depth_multiplier;
Giorgio Arena44f55722019-07-12 14:49:49 +0100650 _dilation = dilation;
Michalis Spyrouf401c742020-05-12 16:18:33 +0100651 _has_biases = (biases != nullptr);
Giorgio Arena44f55722019-07-12 14:49:49 +0100652
Giorgio Arenad93e2632019-10-15 11:09:33 +0100653 if(is_data_type_quantized(_input->info()->data_type()))
Giorgio Arena44f55722019-07-12 14:49:49 +0100654 {
Giorgio Arenad93e2632019-10-15 11:09:33 +0100655 const auto input_scale = input->info()->quantization_info().uniform().scale;
656 const auto output_scale = output->info()->quantization_info().uniform().scale;
657
658 auto weights_scale = weights->info()->quantization_info().scale();
659 if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
660 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100661 for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100662 {
663 weights_scale.push_back(weights_scale.front());
664 }
665 }
666
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100667 for(const auto &s : weights_scale)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100668 {
Michalis Spyroue7be8a02019-12-12 16:16:09 +0000669 int32_t out_mult = 0;
670 int32_t out_shift = 0;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100671 const float multiplier = input_scale * s / output_scale;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000672 arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100673
674 _output_multiplier.push_back(out_mult);
675 _output_shift.push_back(out_shift);
676 }
677 }
678
679 switch(_weights->info()->data_type())
680 {
681 case DataType::QASYMM8:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100682 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100683 break;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000684 case DataType::QASYMM8_SIGNED:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100685 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000686 break;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100687 case DataType::QSYMM8_PER_CHANNEL:
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000688 if(_input->info()->data_type() == DataType::QASYMM8)
689 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100690 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000691 }
692 else
693 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100694 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000695 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100696 break;
697#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
698 case DataType::F16:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100699 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100700 break;
701#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arena44f55722019-07-12 14:49:49 +0100702 case DataType::F32:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100703 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
Giorgio Arena44f55722019-07-12 14:49:49 +0100704 break;
705 default:
706 ARM_COMPUTE_ERROR("Data type not supported");
707 break;
708 }
709
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100710 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
711 auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
712
713 Window win = calculate_max_window(*output->info(), Steps());
714 Coordinates coord;
715 coord.set_num_dimensions(output->info()->num_dimensions());
716 output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
717 INEKernel::configure(win);
Giorgio Arena44f55722019-07-12 14:49:49 +0100718}
719
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100720Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
721 unsigned int depth_multiplier,
722 const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100723{
724 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
Giorgio Arena44f55722019-07-12 14:49:49 +0100725 return Status{};
726}
727
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100728void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100729{
730 ARM_COMPUTE_UNUSED(info);
731 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
732 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
733
Michalis Spyrouf401c742020-05-12 16:18:33 +0100734 (this->*_func)(window, _has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100735}
736
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100737template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
Michalis Spyrouf401c742020-05-12 16:18:33 +0100738void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100739{
740 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
741 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
742
743 if(_depth_multiplier == 1)
744 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100745 depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100746 }
747 else
748 {
Michalis Spyrouf401c742020-05-12 16:18:33 +0100749 depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100750 }
751}
752
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100753template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
Michalis Spyrouf401c742020-05-12 16:18:33 +0100754void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100755{
756 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
757 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
758
759 if(_depth_multiplier == 1)
760 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100761 depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100762 }
763 else
764 {
Michalis Spyrouf401c742020-05-12 16:18:33 +0100765 depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100766 }
767}
768} // namespace arm_compute