blob: c2ed901169224981ddf524876bd824fd095c5ac9 [file] [log] [blame]
Michalis Spyrou7362f0d2017-10-18 17:58:22 +01001/*
Georgios Pinitas2481d462019-02-19 18:47:46 +00002 * Copyright (c) 2017-2019 ARM Limited.
Michalis Spyrou7362f0d2017-10-18 17:58:22 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000024#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
Michalis Spyrou7362f0d2017-10-18 17:58:22 +010025
26#include "arm_compute/core/Helpers.h"
27#include "arm_compute/core/ITensor.h"
28#include "arm_compute/core/PixelValue.h"
Georgios Pinitasd05dce42018-01-22 16:29:17 +000029#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Georgios Pinitasf72f9362018-01-12 16:29:45 +000030#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Michalis Spyrou7362f0d2017-10-18 17:58:22 +010031#include "arm_compute/runtime/NEON/NEScheduler.h"
32#include "support/ToolchainSupport.h"
33
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000034#include "arm_compute/core/utils/misc/InfoHelpers.h"
35
Georgios Pinitasd05dce42018-01-22 16:29:17 +000036using namespace arm_compute::misc;
Georgios Pinitas4074c992018-01-30 18:13:46 +000037using namespace arm_compute::misc::shape_calculator;
Michalis Spyrou7362f0d2017-10-18 17:58:22 +010038
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000039namespace arm_compute
40{
// Default-constructs all kernels/functions and state flags.
// The memory manager is shared with the assembly-dispatch function so both
// pipelines draw intermediate tensors from the same memory group.
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
47
/** Configure the generic (NEON kernel based) pipeline.
 *
 * The generic kernel computes in NCHW: NHWC inputs are permuted in, the result
 * is permuted back out. For quantized inputs the convolution accumulates into
 * an S32 tensor which the output stage re-quantizes (and adds biases to).
 *
 * Note: act_info is deliberately unused here — in the generic path the
 * activation is configured as a separate function by configure().
 */
void NEDepthwiseConvolutionLayer3x3::configure_generic(ITensor                   *input,
                                                       const ITensor             *weights,
                                                       const ITensor             *biases,
                                                       ITensor                   *output,
                                                       const PadStrideInfo       &conv_info,
                                                       unsigned int               depth_multiplier,
                                                       const ActivationLayerInfo &act_info,
                                                       const Size2D              &dilation)
{
    ARM_COMPUTE_UNUSED(act_info);

    PixelValue zero_value(0.f);

    // Initialize the intermediate accumulator tensor in case of quantized input
    if(_is_quantized)
    {
        TensorShape accum_shape  = output->info()->tensor_shape();
        DataLayout  accum_layout = output->info()->data_layout();
        // The kernel runs in NCHW, so the accumulator shape must be the NCHW view of the output
        if(!_is_nchw)
        {
            permute(accum_shape, PermutationVector(1U, 2U, 0U));
            accum_layout = DataLayout::NCHW;
        }

        _memory_group.manage(&_accumulator);
        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
        _accumulator.info()->set_data_layout(accum_layout);
        // In the asymmetric-quantized domain the border must be padded with the
        // zero-point offset, not with literal zero
        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
    }

    if(!_is_nchw)
    {
        _memory_group.manage(&_permuted_input);
        _memory_group.manage(&_permuted_output);

        // Configure the function to transform the input tensor from NHWC -> NCHW
        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
        _permuted_input.info()->set_data_layout(DataLayout::NCHW);

        // Configure the function to transform the weights tensor from HWI -> IHW
        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());

        // Configure depthwise (quantized runs write into the S32 accumulator instead)
        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);

        // Configure border handler
        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);

        // Allocate tensors
        _permuted_input.allocator()->allocate();
    }
    else
    {
        // Configure depthwise convolution kernel
        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);

        // Configure border handler
        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
    }

    // Configure biases accumulation
    if(_is_quantized)
    {
        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
        // Fall back to the input qinfo when the output has not been initialized yet
        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();

        // Fixed-point multiplier/shift pair for re-quantizing the S32 accumulator
        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
        int   output_multiplier;
        int   output_shift;
        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
        _accumulator.allocator()->allocate();
    }
    else if(_has_bias)
    {
        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
    }

    // Permute output
    if(!_is_nchw)
    {
        // Configure the function to transform the convoluted output to NHWC
        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
        _permuted_output.allocator()->allocate();
    }
}
Georgios Pinitas60e98252018-10-22 16:17:20 +0100137
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000138void NEDepthwiseConvolutionLayer3x3::configure_optimized(const ITensor *input,
139 const ITensor *weights,
140 const ITensor *biases,
141 ITensor *output,
142 const PadStrideInfo &conv_info,
143 unsigned int depth_multiplier,
144 const ActivationLayerInfo &act_info)
145{
146 ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
147 const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
148 const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
149 _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
150 if(!_is_activationlayer_enabled)
151 {
152 act_info_to_use = act_info;
153 }
154
155 if(_is_nchw)
156 {
157 _memory_group.manage(&_permuted_input);
158 _memory_group.manage(&_permuted_output);
159
160 // Configure the function to transform the input tensor from NCHW -> NHWC
161 _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
162 _permuted_input.info()->set_data_layout(DataLayout::NHWC);
163
164 // Configure the function to transform the weights tensor from IHW -> HWI
165 _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
166 _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
167
Pablo Telloa28aebc2019-06-03 14:59:48 +0100168 _permuted_output.info()->set_data_layout(DataLayout::NHWC);
169 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
170
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000171 // Configure optimized depthwise
172 _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use);
173
174 // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
175 _permuted_output.info()->set_data_layout(DataLayout::NHWC);
176 _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
177
178 // Allocate tensors
179 _permuted_input.allocator()->allocate();
180 _permuted_output.allocator()->allocate();
181 }
182 else
183 {
184 _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use);
185 }
186}
187
/** Configure the depthwise convolution.
 *
 * Validates the arguments, decides between the assembly-optimized pipeline and
 * the generic NEON-kernel pipeline, and configures the chosen one plus an
 * optional trailing activation layer.
 *
 * NOTE(review): dilation is checked by is_optimized_supported() but is not
 * forwarded to configure_optimized() — presumably the assembly path is only
 * selected for unit dilation; confirm against is_optimized_supported().
 */
void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input,
                                               const ITensor *weights,
                                               const ITensor *biases,
                                               ITensor *output, const PadStrideInfo &conv_info,
                                               unsigned int depth_multiplier,
                                               const ActivationLayerInfo &act_info,
                                               const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer3x3::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                        output->info(), conv_info, depth_multiplier, act_info, dilation));

    _original_weights = weights;
    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
    _has_bias         = biases != nullptr;
    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
                                                                                       weights->info(),
                                                                                       conv_info,
                                                                                       depth_multiplier, dilation);
    _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
    // The optimized path wants NHWC and the generic path wants NCHW, so a
    // permute is needed exactly when the data layout matches the "wrong" path.
    _permute                    = _is_optimized == _is_nchw;
    _is_prepared                = false;
    // May be overwritten by configure_optimized() when the activation is fused
    _is_activationlayer_enabled = act_info.enabled();

    // Configure appropriate pipeline
    if(_is_optimized)
    {
        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info);
    }
    else
    {
        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
    }

    // Configure activation
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(output, nullptr, act_info);
    }
}
229
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000230Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input,
231 const ITensorInfo *weights,
232 const ITensorInfo *biases,
233 const ITensorInfo *output,
234 const PadStrideInfo &conv_info,
235 unsigned int depth_multiplier,
Usama Arife73686a2019-04-08 17:30:48 +0100236 const ActivationLayerInfo &act_info,
237 const Size2D &dilation)
Abe Mbise7784c832018-05-31 16:48:41 +0100238{
239 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
Michele Di Giorgioff271922019-07-17 15:59:32 +0100240 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
241 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100242 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
Usama Arif881f2de2019-04-12 10:29:17 +0100243 ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
244 const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
245 const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
246 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
247 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
Abe Mbise7784c832018-05-31 16:48:41 +0100248
Giorgio Arena66cbafb2018-08-23 14:51:00 +0100249 if(biases != nullptr)
250 {
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100251 const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
Giorgio Arena66cbafb2018-08-23 14:51:00 +0100252 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100253 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
Giorgio Arena66cbafb2018-08-23 14:51:00 +0100254 }
255
Usama Arif881f2de2019-04-12 10:29:17 +0100256 if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100257 {
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000258 const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
259 TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
260 ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
261
262 if(is_quantized)
263 {
Michele Di Giorgioff271922019-07-17 15:59:32 +0100264 const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
265 const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
266 const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
267
268 float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
269 int output_multiplier;
270 int output_shift;
271 ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
272 ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output, output_multiplier, output_shift, oq_info.offset));
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000273 }
274 }
275 else
276 {
277 ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier));
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100278 }
279
Georgios Pinitas60e98252018-10-22 16:17:20 +0100280 //Validate Activation Layer
281 if(act_info.enabled())
282 {
283 ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
284 }
285
Gian Marco Iodice23e24792018-09-07 15:32:14 +0100286 return Status{};
Abe Mbise7784c832018-05-31 16:48:41 +0100287}
288
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000289void NEDepthwiseConvolutionLayer3x3::run_generic()
Michalis Spyrou7362f0d2017-10-18 17:58:22 +0100290{
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000291 // Fill border
292 NEScheduler::get().schedule(&_border_handler, Window::DimX);
Georgios Pinitas4074c992018-01-30 18:13:46 +0000293
294 // Execute depthwise convolution
295 NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
296
Georgios Pinitas4074c992018-01-30 18:13:46 +0000297 // Add biases
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000298 if(_has_bias || _is_quantized)
Michalis Spyrou7362f0d2017-10-18 17:58:22 +0100299 {
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000300 NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
Michalis Spyrou7362f0d2017-10-18 17:58:22 +0100301 }
Giorgio Arena26b22162018-08-13 15:49:49 +0100302
303 // Permute output
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000304 if(!_is_nchw)
Giorgio Arena26b22162018-08-13 15:49:49 +0100305 {
306 _permute_output.run();
307 }
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000308}
Georgios Pinitas60e98252018-10-22 16:17:20 +0100309
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000310void NEDepthwiseConvolutionLayer3x3::run_optimized()
311{
312 // Run assembly function
313 _dwc_optimized_func.run();
314
315 // Permute output
316 if(_is_nchw)
317 {
318 _permute_output.run();
319 }
320}
321
322void NEDepthwiseConvolutionLayer3x3::run()
323{
324 prepare();
325
Georgios Pinitasda953f22019-04-02 17:27:03 +0100326 MemoryGroupResourceScope scope_mg(_memory_group);
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000327
328 // Permute input
329 if(_permute)
330 {
331 _permute_input.run();
332 }
333
334 _is_optimized ? run_optimized() : run_generic();
335
336 // Run activation
Georgios Pinitas60e98252018-10-22 16:17:20 +0100337 if(_is_activationlayer_enabled)
338 {
339 _activationlayer_function.run();
340 }
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000341}
342
343void NEDepthwiseConvolutionLayer3x3::prepare()
344{
345 if(!_is_prepared)
346 {
347 // Permute weights
348 if(_permute)
349 {
350 _permuted_weights.allocator()->allocate();
351 _permute_weights.run();
352 _original_weights->mark_as_unused();
353 }
354
355 // Prepare optimized function
356 if(_is_optimized)
357 {
358 _dwc_optimized_func.prepare();
359 if(!_permuted_weights.is_used())
360 {
361 _permuted_weights.allocator()->free();
362 }
363 }
364
365 _is_prepared = true;
366 }
Michalis Spyroub7b31532017-11-23 12:10:21 +0000367}
368
// Default-constructs all kernels/functions and state flags.
// Identical member layout to NEDepthwiseConvolutionLayer3x3; the memory
// manager is shared with the assembly-dispatch function.
NEDepthwiseConvolutionLayerOptimized::NEDepthwiseConvolutionLayerOptimized(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(memory_manager), _dwc_kernel(), _dwc_optimized_func(memory_manager), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(),
      _activationlayer_function(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(), _original_weights(nullptr), _has_bias(false), _is_quantized(false), _is_optimized(false),
      _is_nchw(true), _permute(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
375
/** Configure the generic (NEON kernel based) pipeline.
 *
 * The generic kernel computes in NCHW: NHWC inputs are permuted in, the result
 * is permuted back out. For quantized inputs the convolution accumulates into
 * an S32 tensor which the output stage re-quantizes (and adds biases to).
 *
 * Note: act_info is deliberately unused here — in the generic path the
 * activation is configured as a separate function by configure().
 */
void NEDepthwiseConvolutionLayerOptimized::configure_generic(ITensor                   *input,
                                                             const ITensor             *weights,
                                                             const ITensor             *biases,
                                                             ITensor                   *output,
                                                             const PadStrideInfo       &conv_info,
                                                             unsigned int               depth_multiplier,
                                                             const ActivationLayerInfo &act_info,
                                                             const Size2D              &dilation)
{
    ARM_COMPUTE_UNUSED(act_info);

    PixelValue zero_value(0.f);

    // Initialize the intermediate accumulator tensor in case of quantized input
    if(_is_quantized)
    {
        TensorShape accum_shape  = output->info()->tensor_shape();
        DataLayout  accum_layout = output->info()->data_layout();
        // The kernel runs in NCHW, so the accumulator shape must be the NCHW view of the output
        if(!_is_nchw)
        {
            permute(accum_shape, PermutationVector(1U, 2U, 0U));
            accum_layout = DataLayout::NCHW;
        }

        _memory_group.manage(&_accumulator);
        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, output->info()->quantization_info()));
        _accumulator.info()->set_data_layout(accum_layout);
        // In the asymmetric-quantized domain the border must be padded with the
        // zero-point offset, not with literal zero
        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().uniform().offset));
    }

    if(!_is_nchw)
    {
        _memory_group.manage(&_permuted_input);
        _memory_group.manage(&_permuted_output);

        // Configure the function to transform the input tensor from NHWC -> NCHW
        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
        _permuted_input.info()->set_data_layout(DataLayout::NCHW);

        // Configure the function to transform the weights tensor from HWI -> IHW
        _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
        _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
        _permuted_output.info()->set_quantization_info(output->info()->quantization_info());

        // Configure depthwise (quantized runs write into the S32 accumulator instead)
        _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier, dilation);

        // Configure border handler
        _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);

        // Allocate tensors
        _permuted_input.allocator()->allocate();
    }
    else
    {
        // Configure depthwise convolution kernel
        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier, dilation);

        // Configure border handler
        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
    }

    // Configure biases accumulation
    if(_is_quantized)
    {
        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
        const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
        // Fall back to the input qinfo when the output has not been initialized yet
        const UniformQuantizationInfo oq_info = (output->info()->total_size() == 0) ? iq_info : output->info()->quantization_info().uniform();

        // Fixed-point multiplier/shift pair for re-quantizing the S32 accumulator
        float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
        int   output_multiplier;
        int   output_shift;
        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
        _accumulator.allocator()->allocate();
    }
    else if(_has_bias)
    {
        _output_stage_kernel.configure(_is_nchw ? output : &_permuted_output, biases);
    }

    // Permute output
    if(!_is_nchw)
    {
        // Configure the function to transform the convoluted output to NHWC
        _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
        _permuted_output.allocator()->allocate();
    }
}
465
466void NEDepthwiseConvolutionLayerOptimized::configure_optimized(const ITensor *input,
467 const ITensor *weights,
468 const ITensor *biases,
469 ITensor *output,
470 const PadStrideInfo &conv_info,
471 unsigned int depth_multiplier,
472 const ActivationLayerInfo &act_info,
473 const Size2D &dilation)
474{
475 ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
476 const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
477 const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
478 _is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
479 if(!_is_activationlayer_enabled)
480 {
481 act_info_to_use = act_info;
482 }
483
484 if(_is_nchw)
485 {
486 _memory_group.manage(&_permuted_input);
487 _memory_group.manage(&_permuted_output);
488
489 // Configure the function to transform the input tensor from NCHW -> NHWC
490 _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
491 _permuted_input.info()->set_data_layout(DataLayout::NHWC);
492
493 // Configure the function to transform the weights tensor from IHW -> HWI
494 _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
495 _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
496
497 _permuted_output.info()->set_data_layout(DataLayout::NHWC);
498 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
499
500 // Configure optimized depthwise
501 _dwc_optimized_func.configure(&_permuted_input, &_permuted_weights, biases, &_permuted_output, conv_info, depth_multiplier, act_info_to_use, dilation);
502
503 // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
504 _permuted_output.info()->set_data_layout(DataLayout::NHWC);
505 _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
506
507 // Allocate tensors
508 _permuted_input.allocator()->allocate();
509 _permuted_output.allocator()->allocate();
510 }
511 else
512 {
513 _dwc_optimized_func.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info_to_use, dilation);
514 }
515}
516
/** Configure the depthwise convolution.
 *
 * Validates the arguments, decides between the assembly-optimized pipeline and
 * the generic NEON-kernel pipeline, and configures the chosen one plus an
 * optional trailing activation layer. Unlike the 3x3 variant, dilation is
 * forwarded to both pipelines.
 */
void NEDepthwiseConvolutionLayerOptimized::configure(ITensor *input,
                                                     const ITensor *weights,
                                                     const ITensor *biases,
                                                     ITensor *output, const PadStrideInfo &conv_info,
                                                     unsigned int depth_multiplier,
                                                     const ActivationLayerInfo &act_info,
                                                     const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayerOptimized::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                              output->info(), conv_info, depth_multiplier, act_info, dilation));

    _original_weights = weights;
    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
    _has_bias         = biases != nullptr;
    _is_optimized     = NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input->info(),
                                                                                       weights->info(),
                                                                                       conv_info,
                                                                                       depth_multiplier,
                                                                                       dilation);
    _is_nchw = input->info()->data_layout() == DataLayout::NCHW;
    // The optimized path wants NHWC and the generic path wants NCHW, so a
    // permute is needed exactly when the data layout matches the "wrong" path.
    _permute                    = _is_optimized == _is_nchw;
    _is_prepared                = false;
    // May be overwritten by configure_optimized() when the activation is fused
    _is_activationlayer_enabled = act_info.enabled();

    // Configure appropriate pipeline
    if(_is_optimized)
    {
        configure_optimized(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
    }
    else
    {
        configure_generic(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
    }

    // Configure activation
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(output, nullptr, act_info);
    }
}
559
/** Static validation of the depthwise convolution configuration.
 *
 * Mirrors configure(): checks data types/layout and the (dilated) kernel
 * footprint against the padded input, then validates whichever pipeline
 * (assembly-optimized or generic) configure() would pick, and finally the
 * optional activation layer.
 *
 * @return A Status describing the first failed check, or an empty Status.
 */
Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo *input,
                                                      const ITensorInfo *weights,
                                                      const ITensorInfo *biases,
                                                      const ITensorInfo *output,
                                                      const PadStrideInfo &conv_info,
                                                      unsigned int depth_multiplier,
                                                      const ActivationLayerInfo &act_info,
                                                      const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);
    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
    // The effective (dilated) kernel extent must fit inside the padded input
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());

    if(biases != nullptr)
    {
        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
    }

    // Validate the pipeline configure() would actually select
    if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
    {
        const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
        TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation));

        if(is_quantized)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionAssemblyDispatch::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation));
    }

    //Validate Activation Layer
    if(act_info.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
    }

    return Status{};
}
610
611void NEDepthwiseConvolutionLayerOptimized::run_generic()
612{
613 // Fill border
614 NEScheduler::get().schedule(&_border_handler, Window::DimX);
615
616 // Execute depthwise convolution
617 NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
618
619 // Add biases
620 if(_has_bias || _is_quantized)
621 {
622 NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
623 }
624
625 // Permute output
626 if(!_is_nchw)
627 {
628 _permute_output.run();
629 }
630}
631
632void NEDepthwiseConvolutionLayerOptimized::run_optimized()
633{
634 // Run assembly function
635 _dwc_optimized_func.run();
636
637 // Permute output
638 if(_is_nchw)
639 {
640 _permute_output.run();
641 }
642}
643
644void NEDepthwiseConvolutionLayerOptimized::run()
645{
646 prepare();
647
648 MemoryGroupResourceScope scope_mg(_memory_group);
649
650 // Permute input
651 if(_permute)
652 {
653 _permute_input.run();
654 }
655
656 _is_optimized ? run_optimized() : run_generic();
657
658 // Run activation
659 if(_is_activationlayer_enabled)
660 {
661 _activationlayer_function.run();
662 }
663}
664
665void NEDepthwiseConvolutionLayerOptimized::prepare()
666{
667 if(!_is_prepared)
668 {
669 // Permute weights
670 if(_permute)
671 {
672 _permuted_weights.allocator()->allocate();
673 _permute_weights.run();
674 _original_weights->mark_as_unused();
675 }
676
677 // Prepare optimized function
678 if(_is_optimized)
679 {
680 _dwc_optimized_func.prepare();
681 if(!_permuted_weights.is_used())
682 {
683 _permuted_weights.allocator()->free();
684 }
685 }
686
687 _is_prepared = true;
688 }
689}
690
// Default constructor: value-initializes every kernel/function member and resets all
// state flags; actual kernel setup is deferred to configure().
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _depthwise_conv_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _fill_border(), _v2mm_input_fill_border(),
      _v2mm_weights_fill_border(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(),
      _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false), _is_quantized(false), _is_nhwc(false), _is_activationlayer_enabled(false), _is_optimized(false),
      _original_weights(nullptr)
{
}
698
// Configure the generic depthwise convolution function.
//
// Two execution paths are selected here:
//  - "optimized": a single NEDepthwiseConvolutionLayerKernel, chosen only for
//    NHWC + F32 inputs (see _is_optimized below);
//  - generic: im2col -> weights reshape -> GEMV -> vector-to-tensor
//    (+ quantized output stage), operating in NCHW, with input/weights/output
//    permutes added when the user tensors are NHWC.
// An un-fused activation function is appended on top of either path when enabled.
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));

    // Single-kernel path is only taken for NHWC F32 tensors
    _is_nhwc      = input->info()->data_layout() == DataLayout::NHWC;
    _is_optimized = _is_nhwc && input->info()->data_type() == DataType::F32;

    if(!_is_optimized)
    {
        // The generic pipeline below works on NCHW tensors; *_to_use aliases point
        // either at the user tensors (NCHW) or at internal permuted copies (NHWC).
        ITensor       *input_to_use   = input;
        const ITensor *weights_to_use = weights;
        ITensor       *output_to_use  = output;

        if(_is_nhwc)
        {
            // NHWC -> NCHW permutation of input and weights
            _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
            _permuted_input.info()->set_data_layout(DataLayout::NCHW);
            input_to_use = &_permuted_input;

            _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
            _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
            weights_to_use = &_permuted_weights;
        }

        // Kernel sizes taken from the (possibly permuted) NCHW weights: W, H, channels
        const size_t weights_w = weights_to_use->info()->dimension(0);
        const size_t weights_h = weights_to_use->info()->dimension(1);
        const size_t weights_z = weights_to_use->info()->dimension(2);

        _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
        _is_prepared  = false;
        // NOTE(review): under NHWC this stores the *permuted* weights tensor, so
        // prepare() will assert on / mark-as-unused the internal copy rather than the
        // caller's weights — confirm this is the intended ownership behaviour.
        _original_weights = weights_to_use;

        // Should bias be appended ?
        // For quantized runs the bias is applied by the output stage instead.
        bool append_bias = (biases != nullptr) && !_is_quantized;

        // Calculate output shape
        TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);

        // Output auto initialization if not yet initialized
        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);

        if(_is_nhwc)
        {
            // The pipeline writes an NCHW result into _permuted_output, which is
            // permuted back to the user's NHWC output at the end of run()
            permute(output_shape, PermutationVector(1U, 2U, 0U));
            _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
            _permuted_output.info()->set_data_layout(DataLayout::NCHW);
            _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
            output_to_use = &_permuted_output;
        }

        // Output width and height
        const unsigned int conv_w = output_shape.x();
        const unsigned int conv_h = output_shape.y();

        // Set up intermediate tensors
        // patch_size: one im2col row = one receptive field (+1 slot when bias is appended)
        const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
        const size_t conv_size  = conv_w * conv_h;

        // Im2Col configuration: [patch_size, conv_size, channels]
        TensorShape shape_im2col = input_to_use->info()->tensor_shape();
        shape_im2col.set(0, patch_size);
        shape_im2col.set(1, conv_size);
        shape_im2col.set(2, weights_z);
        _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
        _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation);

        // Weights reshape configuration: flatten each channel's kernel to a row of patch_size
        const TensorShape shape_weights_reshape(patch_size, weights_z);
        _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
        _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);

        // GEMV configuration: quantized inputs accumulate into S32
        DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
        TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
        shape_v2mm_out.set(0, conv_size * weights_z);
        shape_v2mm_out.set(1, 1);
        shape_v2mm_out.set(2, 1);
        _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
        _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
        _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
        // Quantized: reshape into the S32 staging tensor; otherwise write the output directly
        _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);

        // Output staged configuration: requantize S32 accumulators and add bias
        if(_is_quantized)
        {
            const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
            const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
            const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();

            float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
            int   output_multiplier;
            int   output_shift;
            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
            _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset);
            _output_reshaped.allocator()->allocate();
        }

        if(_is_nhwc)
        {
            // NCHW -> NHWC permutation back to the user's output tensor
            _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));

            _permuted_input.allocator()->allocate();
            _permuted_weights.allocator()->allocate();
            _permuted_output.allocator()->allocate();
        }

        // Fill borders on inputs
        // Quantized tensors are zero-padded with their quantization offset, not literal 0
        PixelValue zero_in(static_cast<int32_t>(0));
        PixelValue zero_w(static_cast<int32_t>(0));
        if(_is_quantized)
        {
            zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().uniform().offset));
            zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().uniform().offset));
        }
        BorderSize border_size = _v2mm_kernel.border_size();
        _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);

        // The reshaped weights only need padding on the sides the GEMV reads beyond
        border_size.bottom = 0;
        _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);

        // Allocate intermediate tensors
        _input_reshaped.allocator()->allocate();
        _v2mm_output.allocator()->allocate();
    }
    else
    {
        // Configure kernel (single-kernel NHWC/F32 path)
        _depthwise_conv_kernel.configure(input, weights, biases, output, conv_info, depth_multiplier, dilation);

        // Fill input borders
        _fill_border.configure(input, _depthwise_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<uint64_t>(0), input->info()->data_type()));
    }

    //Configure Activation Layer (applied as a separate function on the final output)
    _is_activationlayer_enabled = act_info.enabled();

    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(output, nullptr, act_info);
    }
}
845
// Static validation for NEDepthwiseConvolutionLayer::configure().
//
// Mirrors configure()'s path selection: any input that is not NHWC+F32 is checked
// against the generic im2col/GEMV pipeline (including the NHWC->NCHW permutations
// configure() would insert), otherwise against the single optimized kernel.
// Returns an error Status on the first failing check, Status{} on success.
Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON(dilation.x() < 1 || dilation.y() < 1);

    const unsigned int width_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);

    // The dilated kernel footprint must fit inside the padded input
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) + (weights->dimension(width_idx) - 1) * (dilation.x() - 1) > input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) + (weights->dimension(height_idx) - 1) * (dilation.y() - 1) > input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom());
    // Depthwise: weights carry input_channels * depth_multiplier kernels
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) * depth_multiplier) != weights->dimension(channel_idx));

    if(input->data_layout() != DataLayout::NHWC || input->data_type() != DataType::F32)
    {
        // Clone output to use auto init (the caller's output info must not be mutated)
        auto output_clone = output->clone();

        const ITensorInfo *input_to_use   = input;
        const ITensorInfo *weights_to_use = weights;
        const ITensorInfo *output_to_use  = output_clone.get();

        TensorShape permuted_input_shape   = input->tensor_shape();
        TensorShape permuted_weights_shape = weights->tensor_shape();
        TensorInfo  permuted_input;
        TensorInfo  permuted_weights;

        // Model the NHWC -> NCHW permutations configure() would insert
        if(input->data_layout() == DataLayout::NHWC)
        {
            permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
            permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));

            permuted_input   = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
            permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));

            input_to_use   = &permuted_input;
            weights_to_use = &permuted_weights;
        }

        const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
        const bool         append_bias  = (biases != nullptr) && !is_quantized;
        TensorShape        output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
        const size_t       weights_w    = weights_to_use->dimension(0);
        const size_t       weights_h    = weights_to_use->dimension(1);
        const size_t       weights_z    = weights_to_use->dimension(2);
        const unsigned int conv_w       = output_shape[width_idx];
        const unsigned int conv_h       = output_shape[height_idx];
        // One im2col row per receptive field (+1 slot when the bias is appended)
        const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
        const size_t conv_size  = conv_w * conv_h;

        // Output auto initialization if not yet initialized
        auto_init_if_empty(*output_clone, input->clone()->set_tensor_shape(output_shape));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);

        TensorInfo permuted_output;
        if(input->data_layout() == DataLayout::NHWC)
        {
            permute(output_shape, PermutationVector(1U, 2U, 0U));
            permuted_output = TensorInfo(output_clone->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
            output_to_use   = &permuted_output;
        }

        // Im2Col configuration: [patch_size, conv_size, channels]
        TensorShape shape_im2col = input_to_use->tensor_shape();
        shape_im2col.set(0, patch_size);
        shape_im2col.set(1, conv_size);
        shape_im2col.set(2, weights_z);
        TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier, dilation));

        // Weights reshape configuration: flatten each channel's kernel to a row
        const TensorShape shape_weights_reshape(patch_size, weights_z);
        TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));

        // GEMV configuration: quantized inputs accumulate into S32
        DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
        TensorShape shape_v2mm_out = input_to_use->tensor_shape();
        shape_v2mm_out.set(0, conv_size * weights_z);
        shape_v2mm_out.set(1, 1);
        shape_v2mm_out.set(2, 1);
        TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));

        TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));

        // Quantized: check the requantization output stage as well
        if(is_quantized)
        {
            const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
            const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
            const UniformQuantizationInfo oq_info = output->quantization_info().uniform();

            float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
            int   output_multiplier;
            int   output_shift;
            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use, output_multiplier, output_shift, oq_info.offset));
        }
    }
    else
    {
        // NHWC/F32: validate against the single optimized kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, dilation));
    }

    // Validate Activation Layer
    if(act_info.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
    }

    return Status{};
}
961
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000962void NEDepthwiseConvolutionLayer::run()
Michalis Spyroub7b31532017-11-23 12:10:21 +0000963{
Giorgio Arena44f55722019-07-12 14:49:49 +0100964 if(!_is_optimized)
Giorgio Arena26b22162018-08-13 15:49:49 +0100965 {
Giorgio Arena44f55722019-07-12 14:49:49 +0100966 prepare();
967
968 if(_is_nhwc)
969 {
970 _permute_input.run();
971 }
972
973 NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
974 NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
975 NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
976 NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
977 if(_is_quantized)
978 {
979 NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
980 }
981
982 if(_is_nhwc)
983 {
984 _permute_output.run();
985 }
Giorgio Arena26b22162018-08-13 15:49:49 +0100986 }
Giorgio Arena44f55722019-07-12 14:49:49 +0100987 else
Georgios Pinitasd05dce42018-01-22 16:29:17 +0000988 {
Giorgio Arena44f55722019-07-12 14:49:49 +0100989 NEScheduler::get().schedule(&_fill_border, Window::DimX);
990 NEScheduler::get().schedule(&_depthwise_conv_kernel, Window::DimY);
Giorgio Arena26b22162018-08-13 15:49:49 +0100991 }
Georgios Pinitas60e98252018-10-22 16:17:20 +0100992
993 if(_is_activationlayer_enabled)
994 {
995 _activationlayer_function.run();
996 }
Anthony Barbierfb8dda22018-01-30 09:27:05 +0000997}
Georgios Pinitas72219332018-06-05 14:56:06 +0100998
999void NEDepthwiseConvolutionLayer::prepare()
1000{
Giorgio Arena44f55722019-07-12 14:49:49 +01001001 if(!_is_prepared && !_is_optimized)
Georgios Pinitas72219332018-06-05 14:56:06 +01001002 {
1003 ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
1004
Giorgio Arena26b22162018-08-13 15:49:49 +01001005 if(_is_nhwc)
1006 {
1007 _permute_weights.run();
1008 }
1009
Georgios Pinitas72219332018-06-05 14:56:06 +01001010 // Run reshape and mark original weights as unused
1011 _weights_reshaped.allocator()->allocate();
1012 NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
1013 NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
1014 _original_weights->mark_as_unused();
1015
1016 _is_prepared = true;
1017 }
1018}
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001019} // namespace arm_compute