blob: 24fd01fee1d55a2c4c4eb1b0ec36c962b870f8bc [file] [log] [blame]
/*
 * Copyright (c) 2019-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
Giorgio Arena44f55722019-07-12 14:49:49 +010025
Giorgio Arena44f55722019-07-12 14:49:49 +010026#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010027#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/CPP/Validate.h"
Giorgio Arenad93e2632019-10-15 11:09:33 +010029#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010030#include "src/core/NEON/wrapper/traits.h"
31#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Matthew Bentham758b5ba2020-03-05 23:37:48 +000034#include "support/ToolchainSupport.h"
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +010035
Giorgio Arena44f55722019-07-12 14:49:49 +010036namespace arm_compute
37{
38namespace
39{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010040constexpr auto data_layout = DataLayout::NHWC;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010041const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
42const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
43const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
44
45constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
46constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
47constexpr size_t vector_size = 8;
48
49struct DepthwiseConvolutionRunInfo
Giorgio Arenad93e2632019-10-15 11:09:33 +010050{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010051 const size_t num_read_elements_per_iteration;
52 const uint32_t x_start;
53 const uint32_t x_end;
54 const uint32_t x_step;
55 const uint32_t x_leftover_start;
56 const size_t input_stride_y;
57 const size_t input_stride_z;
58 const size_t input_max_offset;
59 const size_t weights_width;
60 const size_t weights_height;
61 const size_t weights_stride_y;
62 const size_t weights_stride_z;
63 const size_t conv_stride_x;
64 const size_t conv_stride_y;
65 const size_t conv_pad_left;
66 const size_t conv_pad_top;
67 const size_t input_height;
68 const size_t input_width;
69 const size_t input_depth;
70
71 DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
72 : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
73 x_start(w.x().start()),
74 x_end(w.x().end()),
75 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
76 x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
77 input_stride_y(input.strides_in_bytes().y()),
78 input_stride_z(input.strides_in_bytes().z()),
79 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
80 weights_width(weights.dimension(width_idx)),
81 weights_height(weights.dimension(height_idx)),
82 weights_stride_y(weights.strides_in_bytes().y()),
83 weights_stride_z(weights.strides_in_bytes().z()),
84 conv_stride_x(conv_info.stride().first),
85 conv_stride_y(conv_info.stride().second),
86 conv_pad_left(conv_info.pad_left()),
87 conv_pad_top(conv_info.pad_top()),
88 input_height(input.dimension(height_idx)),
89 input_width(input.dimension(width_idx)),
90 input_depth(input.dimension(channel_idx))
Giorgio Arenad93e2632019-10-15 11:09:33 +010091 {
Giorgio Arenad93e2632019-10-15 11:09:33 +010092 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +010093};
94
95inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
96{
97 const int32_t current_h = base_h + h * dilation.y();
98 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
99
100 const int32_t current_w = base_w + w * dilation.x();
101 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
102
103 return is_valid_h && is_valid_w;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100104}
105
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100106template <typename T>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100107void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100108 const Size2D &dilation, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100109{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100110 constexpr auto element_per_vector = vector_size / sizeof(T);
111 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
112 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
Giorgio Arena44f55722019-07-12 14:49:49 +0100113
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100114 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
115
116 const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
117
118 Window execution_window = window;
119 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100120
121 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100122 win_input.set(Window::DimX, dim_manual_loop);
123 win_input.set(Window::DimY, dim_manual_loop);
124 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100125
126 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100127 win_weights.set(Window::DimW, dim_manual_loop);
128
129 Window win_output = window;
130 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100131
132 Iterator input_it(input, win_input);
133 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100134 Iterator output_it(output, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100135 Iterator biases_it{};
136
137 if(has_biases)
138 {
139 biases_it = Iterator(biases, win_weights);
140 }
141
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100142 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100143 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100144 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
145 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
146 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100147
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100148 auto const base_weights_ptr = weights_it.ptr();
149 uint32_t x = run_info.x_start;
Giorgio Arena44f55722019-07-12 14:49:49 +0100150
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100151 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arena44f55722019-07-12 14:49:49 +0100152 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100153 VectorType acc = zero_vector;
154 auto weights_ptr = base_weights_ptr;
155 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100156
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100157 for(uint32_t h = 0; h < run_info.weights_height; ++h)
158 {
159 int64_t offs = input_offset + x * sizeof(T);
160 for(uint32_t w = 0; w < run_info.weights_width; ++w)
161 {
162 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
163 const auto input_vals = is_valid_region ?
164 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
165 zero_vector;
166 const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
167 acc = wrapper::vmla(acc, weights_vals, input_vals);
168
169 offs += dilation.x() * run_info.input_stride_y;
170 }
171
172 weights_ptr += run_info.weights_stride_z;
173 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100174 }
175
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100176 if(has_biases)
177 {
178 const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
179 acc = wrapper::vadd(acc, biases_vals);
180 }
181
182 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
Giorgio Arena44f55722019-07-12 14:49:49 +0100183 }
184
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100185 for(; x < run_info.x_end; ++x)
Giorgio Arena44f55722019-07-12 14:49:49 +0100186 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100187 auto acc_scalar = T{ 0 };
188 auto weights_ptr = base_weights_ptr;
189 int64_t input_offset = base_input_offset;
Giorgio Arena44f55722019-07-12 14:49:49 +0100190
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100191 for(size_t h = 0; h < run_info.weights_height; ++h)
192 {
193 int64_t offs = input_offset + x * sizeof(T);
194 for(size_t w = 0; w < run_info.weights_width; ++w)
195 {
196 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
197 const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
198 const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
199
200 acc_scalar += (input_vals * weights_vals);
201
202 offs += dilation.x() * run_info.input_stride_y;
203 }
204
205 weights_ptr += run_info.weights_stride_z;
206 input_offset += dilation.y() * run_info.input_stride_z;
207 }
208
209 if(has_biases)
210 {
211 const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
212 acc_scalar += biases_vals;
213 }
214 *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
215 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100216 },
217 input_it, weights_it, biases_it, output_it);
218}
219
Michalis Spyrouf401c742020-05-12 16:18:33 +0100220template <typename T>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100221void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100222 const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100223{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100224 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
Giorgio Arena44f55722019-07-12 14:49:49 +0100225
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100226 Window execution_window = window;
227 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arena44f55722019-07-12 14:49:49 +0100228
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100229 Window win_input = execution_window;
230 win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
231 win_input.set(Window::DimY, dim_manual_loop);
232 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arena44f55722019-07-12 14:49:49 +0100233
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100234 Window win_weights = window;
235 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
236 win_weights.set(Window::DimY, dim_manual_loop);
237 win_weights.set(Window::DimZ, dim_manual_loop);
238 win_weights.set(Window::DimW, dim_manual_loop);
239
240 Window win_output = window;
241 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arena44f55722019-07-12 14:49:49 +0100242
243 Iterator input_it(input, win_input);
244 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100245 Iterator output_it(output, win_output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100246 Iterator biases_it{};
247
248 if(has_biases)
249 {
250 biases_it = Iterator(biases, win_weights);
251 }
252
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100253 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arena44f55722019-07-12 14:49:49 +0100254 {
255 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
256
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100257 const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
258 const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
259 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100260
261 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100262 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arena44f55722019-07-12 14:49:49 +0100263 {
264 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100265 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arena44f55722019-07-12 14:49:49 +0100266 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100267 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
268 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
Giorgio Arena44f55722019-07-12 14:49:49 +0100269
270 for(size_t m = 0; m < depth_multiplier; ++m)
271 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100272 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Georgios Pinitas1c29ffc2019-08-01 15:03:00 +0100273 acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
Giorgio Arena44f55722019-07-12 14:49:49 +0100274 }
275
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100276 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arena44f55722019-07-12 14:49:49 +0100277 }
278
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100279 weights_ptr += run_info.weights_stride_z;
280 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arena44f55722019-07-12 14:49:49 +0100281 }
282
283 if(has_biases)
284 {
285 for(size_t m = 0; m < depth_multiplier; ++m)
286 {
287 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
288 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
289 }
290 }
291 else
292 {
293 for(size_t m = 0; m < depth_multiplier; ++m)
294 {
295 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
296 }
297 }
298 },
299 input_it, weights_it, biases_it, output_it);
300}
301
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100302template <typename T, typename TW>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100303void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100304 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100305{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100306 constexpr auto element_per_vector = vector_size / sizeof(T);
307 using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
308 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
309 using AccType = int32_t;
310 using AccArrayType = std::array<AccType, element_per_vector>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100311
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100312 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
313 const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
314
315 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100316
317 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
318 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
319 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100320 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
321
322 Window execution_window = window;
323 execution_window.set(Window::DimX, dim_single_unit_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100324
325 Window win_input = window;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100326 win_input.set(Window::DimX, dim_manual_loop);
327 win_input.set(Window::DimY, dim_manual_loop);
328 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100329
330 Window win_weights = win_input;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100331 win_weights.set(Window::DimW, dim_manual_loop);
332
333 Window win_output = window;
334 win_output.set(Window::DimX, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100335
336 Iterator input_it(input, win_input);
337 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100338 Iterator output_it(output, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100339 Iterator biases_it{};
340
341 if(has_biases)
342 {
343 biases_it = Iterator(biases, win_weights);
344 }
345
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100346 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100347 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100348 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
349 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
350 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
351 auto const base_weights_ptr = weights_it.ptr();
352 size_t x = run_info.x_start;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100353
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100354 for(; x < run_info.x_leftover_start; x += run_info.x_step)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100355 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100356 AccArrayType acc{};
357 AccArrayType in_sum{};
358 AccArrayType we_sum{};
Giorgio Arenad93e2632019-10-15 11:09:33 +0100359
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100360 auto weights_ptr = base_weights_ptr;
361 auto input_offset = base_input_offset;
362
363 for(size_t h = 0; h < run_info.weights_height; ++h)
364 {
365 int64_t offs = input_offset + x * sizeof(T);
366 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100367 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100368 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
369 const auto input_vals = is_valid_region ?
370 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
371 out_of_bound_vector;
372 const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
373
Sang-Hoon Park1a0a4bc2020-11-12 17:41:32 +0000374 for(size_t i = 0; i < element_per_vector; ++i)
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100375 {
376 acc.at(i) += input_vals[i] * weights_vals[i];
377 in_sum.at(i) += input_vals[i];
378 we_sum.at(i) += weights_vals[i];
379 }
380
381 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100382 }
383
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100384 weights_ptr += run_info.weights_stride_z;
385 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100386 }
387
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100388 VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
Sang-Hoon Park1a0a4bc2020-11-12 17:41:32 +0000389 for(size_t i = 0; i < element_per_vector; ++i)
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100390 {
391 acc.at(i) -= in_sum.at(i) * weights_qoffset;
392 acc.at(i) -= we_sum.at(i) * input_qoffset;
393 acc.at(i) += k_offset;
394
395 if(has_biases)
396 {
397 acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
398 }
399
400 const int32_t out_mul = output_multiplier.at(x + i);
401 const int32_t out_shift = output_shift.at(x + i);
402 if(out_shift < 0)
403 {
404 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
405 }
406 else
407 {
408 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
409 }
410 out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
411 }
412
413 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100414 }
415
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100416 // left-over
417 for(; x < run_info.x_end; ++x)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100418 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100419 AccType acc = 0;
420 AccType in_sum = 0;
421 AccType we_sum = 0;
422
423 auto weights_ptr = base_weights_ptr;
424 auto input_offset = base_input_offset;
425
426 for(size_t h = 0; h < run_info.weights_height; ++h)
427 {
428 int64_t offs = input_offset + x * sizeof(T);
429 for(size_t w = 0; w < run_info.weights_width; ++w)
430 {
431 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
432 const auto input_val = is_valid_region ?
433 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
434 out_of_bound_value;
435 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
436
437 acc += input_val * weights_val;
438 in_sum += input_val;
439 we_sum += weights_val;
440
441 offs += dilation.x() * run_info.input_stride_y;
442 }
443
444 weights_ptr += run_info.weights_stride_z;
445 input_offset += dilation.y() * run_info.input_stride_z;
446 }
447
448 T out_vals{ 0 };
449
450 acc -= in_sum * weights_qoffset;
451 acc -= we_sum * input_qoffset;
452 acc += k_offset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100453
454 if(has_biases)
455 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100456 acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100457 }
458
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100459 const int32_t out_mul = output_multiplier.at(x);
460 const int32_t out_shift = output_shift.at(x);
461
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000462 if(out_shift < 0)
463 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100464 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000465 }
466 else
467 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100468 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000469 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100470
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100471 out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
472 *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
473 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100474 },
475 input_it, weights_it, biases_it, output_it);
476}
477
Michalis Spyrouf401c742020-05-12 16:18:33 +0100478template <typename T, typename TW>
Giorgio Arenad93e2632019-10-15 11:09:33 +0100479void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
Michalis Spyrouf401c742020-05-12 16:18:33 +0100480 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100481{
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100482 using AccType = int32_t;
483
484 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
485
486 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
Giorgio Arenad93e2632019-10-15 11:09:33 +0100487
488 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
489 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
490 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100491 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100492
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100493 Window execution_window = window;
494 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100495
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100496 Window win_input = execution_window;
497 win_input.set(Window::DimY, dim_manual_loop);
498 win_input.set(Window::DimZ, dim_manual_loop);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100499
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100500 Window win_weights = window;
501 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
502 win_weights.set(Window::DimY, dim_manual_loop);
503 win_weights.set(Window::DimZ, dim_manual_loop);
504 win_weights.set(Window::DimW, dim_manual_loop);
505
506 Window win_output = window;
507 win_output.set_dimension_step(Window::DimX, run_info.x_step);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100508
509 Iterator input_it(input, win_input);
510 Iterator weights_it(weights, win_weights);
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100511 Iterator output_it(output, win_output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100512 Iterator biases_it{};
513
514 if(has_biases)
515 {
516 biases_it = Iterator(biases, win_weights);
517 }
518
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100519 execute_window_loop(execution_window, [&](const Coordinates & id)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100520 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100521 std::vector<AccType> acc(depth_multiplier, 0);
522 std::vector<AccType> we_sum(depth_multiplier, 0);
523 AccType in_sum = 0;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100524
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100525 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
526 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
527 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100528
529 auto weights_ptr = weights_it.ptr();
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100530 for(size_t h = 0; h < run_info.weights_height; ++h)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100531 {
532 int offs = input_offset;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100533 for(size_t w = 0; w < run_info.weights_width; ++w)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100534 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100535 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
536 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100537
538 for(size_t m = 0; m < depth_multiplier; ++m)
539 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100540 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100541 acc.at(m) += input_val * weights_val;
542
543 we_sum.at(m) += weights_val;
544 }
545
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100546 offs += dilation.x() * run_info.input_stride_y;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100547 in_sum += input_val;
548 }
549
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100550 weights_ptr += run_info.weights_stride_z;
551 input_offset += dilation.y() * run_info.input_stride_z;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100552 }
553
554 for(size_t m = 0; m < depth_multiplier; ++m)
555 {
556 acc.at(m) -= in_sum * weights_qoffset;
557 acc.at(m) -= we_sum.at(m) * input_qoffset;
558 acc.at(m) += k_offset;
559
560 if(has_biases)
561 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000562 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
563 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100564
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100565 const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
566 const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000567 if(out_shift < 0)
568 {
569 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100570 }
571 else
572 {
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000573 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100574 }
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100575 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100576 }
577 },
578 input_it, weights_it, biases_it, output_it);
579}
580
Giorgio Arena3737c792020-11-23 17:47:23 +0000581template <typename T, typename TW>
582void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
583 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
584{
585 constexpr int half_vec = vector_size / 2;
586
587 using AccType = int32_t;
588 using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
589 using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
590 using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
591
592 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
593
594 const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
595 const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
596 const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
597
598 const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
599 const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
600 const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
601
602 const auto out_mul = output_multiplier.at(0);
603 const auto out_shift = output_shift.at(0);
604
605 Window execution_window = window;
606 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
607
608 Window win_input = execution_window;
609 win_input.set(Window::DimY, dim_manual_loop);
610 win_input.set(Window::DimZ, dim_manual_loop);
611
612 Window win_weights = window;
613 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
614 win_weights.set(Window::DimY, dim_manual_loop);
615 win_weights.set(Window::DimZ, dim_manual_loop);
616 win_weights.set(Window::DimW, dim_manual_loop);
617
618 Window win_output = window;
619 win_output.set_dimension_step(Window::DimX, run_info.x_step);
620
621 Iterator input_it(input, win_input);
622 Iterator weights_it(weights, win_weights);
623 Iterator output_it(output, win_output);
624 Iterator biases_it{};
625
626 if(has_biases)
627 {
628 biases_it = Iterator(biases, win_weights);
629 }
630
631 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
632 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
633
634 execute_window_loop(execution_window, [&](const Coordinates & id)
635 {
636 std::fill(begin(acc0), end(acc0), zero);
637 std::fill(begin(acc1), end(acc1), zero);
638
639 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
640 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
641 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
642
643 auto weights_ptr = weights_it.ptr();
644 for(size_t h = 0; h < run_info.weights_height; ++h)
645 {
646 const int32_t current_h = input_z + h * dilation.y();
647 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
648 {
649 int offs = input_offset;
650 for(size_t w = 0; w < run_info.weights_width; ++w)
651 {
652 const int32_t current_w = input_y + w * dilation.x();
653 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
654 {
655 const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
656 const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
657 const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
658
659 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
660 {
661 const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
662 const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
663 const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
664
665 acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
666 acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
667 }
668 }
669
670 offs += dilation.x() * run_info.input_stride_y;
671 }
672 }
673
674 weights_ptr += run_info.weights_stride_z;
675 input_offset += dilation.y() * run_info.input_stride_z;
676 }
677
678 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
679 {
680 if(has_biases)
681 {
682 const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
683 const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
684
685 acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
686 acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
687 }
688
689 if(out_shift < 0)
690 {
691 acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
692 acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
693 }
694 else
695 {
696 acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
697 acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
698 }
699
700 acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
701 acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
702
703 const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
704 wrapper::vmovn(acc1.at(i)));
705
706 if(std::is_same<T, uint8_t>::value)
707 {
708 wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
709 }
710 else
711 {
712 wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
713 }
714 }
715 },
716 input_it, weights_it, biases_it, output_it);
717}
718
Giorgio Arena44f55722019-07-12 14:49:49 +0100719Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
720 const Size2D &dilation)
721{
722 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100723 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
724 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000725 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
Giorgio Arena44f55722019-07-12 14:49:49 +0100726 ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100727 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
728 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
Giorgio Arena44f55722019-07-12 14:49:49 +0100729 ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
730 ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
731 ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));
732
Giorgio Arenad93e2632019-10-15 11:09:33 +0100733 if(is_data_type_quantized_per_channel(weights->data_type()))
734 {
735 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100736 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
737 }
738 else
739 {
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +0100740 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100741 }
742
Giorgio Arena44f55722019-07-12 14:49:49 +0100743 if(biases != nullptr)
744 {
745 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
746 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
Giorgio Arenad93e2632019-10-15 11:09:33 +0100747
748 if(is_data_type_quantized_asymmetric(input->data_type()))
749 {
750 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
751 }
752 else
753 {
754 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
755 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100756 }
757
758 if(output->total_size() != 0)
759 {
760 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
761 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +0100762 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Giorgio Arena44f55722019-07-12 14:49:49 +0100763 }
764
765 return Status{};
766}
Giorgio Arenad93e2632019-10-15 11:09:33 +0100767} // namespace
Giorgio Arena44f55722019-07-12 14:49:49 +0100768
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100769NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100770 : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
Giorgio Arena44f55722019-07-12 14:49:49 +0100771{
772}
773
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100774void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
775 const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100776{
777 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
778 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));
779
780 _input = input;
781 _weights = weights;
782 _biases = biases;
783 _output = output;
784 _conv_info = conv_info;
785 _depth_multiplier = depth_multiplier;
Giorgio Arena44f55722019-07-12 14:49:49 +0100786 _dilation = dilation;
Michalis Spyrouf401c742020-05-12 16:18:33 +0100787 _has_biases = (biases != nullptr);
Giorgio Arena44f55722019-07-12 14:49:49 +0100788
Giorgio Arenad93e2632019-10-15 11:09:33 +0100789 if(is_data_type_quantized(_input->info()->data_type()))
Giorgio Arena44f55722019-07-12 14:49:49 +0100790 {
Giorgio Arenad93e2632019-10-15 11:09:33 +0100791 const auto input_scale = input->info()->quantization_info().uniform().scale;
792 const auto output_scale = output->info()->quantization_info().uniform().scale;
793
794 auto weights_scale = weights->info()->quantization_info().scale();
795 if(!is_data_type_quantized_per_channel(_weights->info()->data_type()))
796 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100797 for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100798 {
799 weights_scale.push_back(weights_scale.front());
800 }
801 }
802
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100803 for(const auto &s : weights_scale)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100804 {
Michalis Spyroue7be8a02019-12-12 16:16:09 +0000805 int32_t out_mult = 0;
806 int32_t out_shift = 0;
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100807 const float multiplier = input_scale * s / output_scale;
Michele Di Giorgiof29d1b72019-10-29 10:58:13 +0000808 arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100809
810 _output_multiplier.push_back(out_mult);
811 _output_shift.push_back(out_shift);
812 }
813 }
814
815 switch(_weights->info()->data_type())
816 {
817 case DataType::QASYMM8:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100818 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100819 break;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000820 case DataType::QASYMM8_SIGNED:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100821 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000822 break;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100823 case DataType::QSYMM8_PER_CHANNEL:
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000824 if(_input->info()->data_type() == DataType::QASYMM8)
825 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100826 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000827 }
828 else
829 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100830 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
Michele Di Giorgio8c837ca2020-01-07 15:06:41 +0000831 }
Giorgio Arenad93e2632019-10-15 11:09:33 +0100832 break;
833#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
834 case DataType::F16:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100835 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
Giorgio Arenad93e2632019-10-15 11:09:33 +0100836 break;
837#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arena44f55722019-07-12 14:49:49 +0100838 case DataType::F32:
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100839 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
Giorgio Arena44f55722019-07-12 14:49:49 +0100840 break;
841 default:
842 ARM_COMPUTE_ERROR("Data type not supported");
843 break;
844 }
845
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100846 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
847 auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
848
SiCongLib88272e2021-02-24 15:40:57 +0000849 Window win = calculate_max_window(*output->info(), Steps());
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100850 INEKernel::configure(win);
Giorgio Arena44f55722019-07-12 14:49:49 +0100851}
852
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100853Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
854 unsigned int depth_multiplier,
855 const Size2D &dilation)
Giorgio Arena44f55722019-07-12 14:49:49 +0100856{
857 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
Giorgio Arena44f55722019-07-12 14:49:49 +0100858 return Status{};
859}
860
Gian Marco Iodicebd9097d2019-07-26 15:31:02 +0100861void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
Giorgio Arena44f55722019-07-12 14:49:49 +0100862{
863 ARM_COMPUTE_UNUSED(info);
864 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
865 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
866
Michalis Spyrouf401c742020-05-12 16:18:33 +0100867 (this->*_func)(window, _has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100868}
869
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100870template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
Michalis Spyrouf401c742020-05-12 16:18:33 +0100871void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
Giorgio Arena44f55722019-07-12 14:49:49 +0100872{
873 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
874 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
875
876 if(_depth_multiplier == 1)
877 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100878 depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
Giorgio Arena44f55722019-07-12 14:49:49 +0100879 }
880 else
881 {
Michalis Spyrouf401c742020-05-12 16:18:33 +0100882 depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100883 }
884}
885
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100886template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
Michalis Spyrouf401c742020-05-12 16:18:33 +0100887void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
Giorgio Arenad93e2632019-10-15 11:09:33 +0100888{
889 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
890 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
891
892 if(_depth_multiplier == 1)
893 {
Sang-Hoon Parke4558b52020-10-01 10:13:07 +0100894 depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
Giorgio Arenad93e2632019-10-15 11:09:33 +0100895 }
896 else
897 {
Giorgio Arena3737c792020-11-23 17:47:23 +0000898 const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
899 const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(_weights->info()->data_type()));
900
901 if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
902 {
903 depthwise_loop_pow2_quantized_per_tensor<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
904 }
905 else
906 {
907 depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
908 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100909 }
910}
911} // namespace arm_compute