blob: 049bf66689c553cf2fa1f7188bd749a87c795b29 [file] [log] [blame]
Georgios Pinitas47d39dc2019-03-11 14:03:23 +00001/*
2 * Copyright (c) 2019 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
26
27#include "arm_compute/core/CPP/Validate.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
30#include "arm_compute/core/Utils.h"
31#include "arm_compute/core/utils/misc/InfoHelpers.h"
32#include "arm_compute/core/utils/misc/ShapeCalculator.h"
33#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
34
35#include "arm_compute/runtime/NEON/NEScheduler.h"
36
37namespace arm_compute
38{
39namespace
40{
41std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensor *input,
42 const ITensor *weights,
43 ITensor *output,
44 PadStrideInfo conv_info,
45 ActivationLayerInfo act_info)
46{
47 const DataType data_type = input->info()->data_type();
48 const TensorShape shape = input->info()->tensor_shape();
49
50 const int n_batches = shape[3];
51 const int in_rows = shape.z();
52 const int in_cols = shape.y();
53 const int n_channels = shape.x();
54 const int padding_top = conv_info.pad_top();
55 const int padding_left = conv_info.pad_left();
56 const int padding_bottom = conv_info.pad_bottom();
57 const int padding_right = conv_info.pad_right();
58
59 const unsigned int stride_x = conv_info.stride().first;
60
61 // Map activation function
62 neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
63 if(arm_compute::utils::info_helpers::is_relu(act_info))
64 {
65 activation = neon_convolution_kernels::ActivationFunction::ReLU;
66 }
67 else if(arm_compute::utils::info_helpers::is_relu6(act_info))
68 {
69 activation = neon_convolution_kernels::ActivationFunction::ReLU6;
70 }
71
72 // Create quantized convolver
73 if(data_type == DataType::QASYMM8)
74 {
75 const QuantizationInfo &input_qinfo = input->info()->quantization_info();
76 const QuantizationInfo &weights_qinfo = weights->info()->quantization_info();
77 const QuantizationInfo &output_qinfo = output->info()->quantization_info();
78
79 // Check that quantization info are in the range [0, 255]
80 ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
81 ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
82 ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
83 const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
84 const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
85 const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };
86
87 // Calculate rescale parameters
88 const float fmultipler = iqinfo.scale * wqinfo.scale / oqinfo.scale;
89 int qmultiplier = 0;
90 int qshift = 0;
91 quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
92 qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);
93
94 // Create convolver
95 switch(stride_x)
96 {
97 case 1:
98 return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
99 n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
100 case 2:
101 return arm_compute::support::cpp14::make_unique<depthwise::QAsymm8DepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
102 n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
103 default:
104 return nullptr;
105 }
106 }
107 else
108 {
109 // Create float convolver
110 switch(data_type)
111 {
112#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
113 case DataType::F16:
114 {
115 switch(stride_x)
116 {
117 case 1:
118 return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
119 n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
120 case 2:
121 return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
122 n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
123 default:
124 return nullptr;
125 }
126 break;
127 }
128#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
129 case DataType::F32:
130 {
131 switch(stride_x)
132 {
133 case 1:
134 return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
135 n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
136 case 2:
137 return arm_compute::support::cpp14::make_unique<depthwise::DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
138 n_batches, in_rows, in_cols, n_channels, activation, padding_top, padding_left, padding_bottom, padding_right);
139 default:
140 return nullptr;
141 }
142 break;
143 }
144 default:
145 return nullptr;
146 }
147 }
148}
149} // namespace
150
151#ifndef DOXYGEN_SKIP_THIS
152NEDepthwiseConvolutionAssemblyDispatch::NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<arm_compute::IMemoryManager> memory_manager)
153 : _memory_group(std::move(memory_manager)), _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _packed_weights(), _workspace(), _is_prepared(false), _dwc_assembly_kernel(nullptr),
154 _dwc_acl_kernel()
155{
156}
157#endif /* DOXYGEN_SKIP_THIS */
158
159void NEDepthwiseConvolutionAssemblyDispatch::configure(const ITensor *input,
160 const ITensor *weights,
161 const ITensor *bias,
162 ITensor *output,
163 const PadStrideInfo &conv_info,
164 unsigned int depth_multiplier,
165 const ActivationLayerInfo &act_info)
166{
167 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
168 ARM_COMPUTE_UNUSED(depth_multiplier);
169 ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionAssemblyDispatch::validate(input->info(),
170 weights->info(),
171 bias != nullptr ? bias->info() : nullptr,
172 output->info(),
173 conv_info,
174 depth_multiplier,
175 act_info));
176
177 // Output auto inizialitation if not yet initialized
178 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
179 auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
180
181 _input = input;
182 _weights = weights;
183 _bias = bias;
184 _output = output;
185 _is_prepared = false;
186
187 // Create convolver
188 _dwc_assembly_kernel = create_convolver(input, weights, output, conv_info, act_info);
189 ARM_COMPUTE_ERROR_ON(_dwc_assembly_kernel == nullptr);
190
191 // Create assembly kernel wrapper
192 _dwc_acl_kernel.configure(_dwc_assembly_kernel.get());
193
194 constexpr size_t alignment = 128;
195
196 // Create workspace
197 const unsigned int num_threads = NEScheduler::get().num_threads();
198 const size_t workspace_size = _dwc_assembly_kernel->get_working_space_size(num_threads);
199 ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
200 _workspace.allocator()->init(TensorInfo(TensorShape{ workspace_size }, 1, DataType::S8), alignment);
201 _memory_group.manage(&_workspace);
202 _workspace.allocator()->allocate();
203
204 // Create packing tensor
205 const size_t pack_tensor_size = _dwc_assembly_kernel->get_packed_params_size();
206 ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");
207 _packed_weights.allocator()->init(TensorInfo(TensorShape{ pack_tensor_size }, 1, DataType::S8), alignment);
208}
209
210Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo *input,
211 const ITensorInfo *weights,
212 const ITensorInfo *bias,
213 const ITensorInfo *output,
214 const PadStrideInfo &conv_info,
215 unsigned int depth_multiplier,
216 const ActivationLayerInfo &act_info)
217{
218 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
219 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
220 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
221 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
222
223 const auto strides = conv_info.stride();
224 const DataLayout data_layout = input->data_layout();
225 unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
226 unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
227 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
228 ARM_COMPUTE_RETURN_ERROR_ON(!((strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2))));
229 ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier != 1);
230
231 const bool is_relu = arm_compute::utils::info_helpers::is_relu(act_info);
232 const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
233 ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !(is_relu || is_relu6));
234
235 // Check bias
236 if(bias != nullptr)
237 {
238 unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
239 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
240 ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
241 }
242
243 // Check output
244 if(output->total_size() != 0)
245 {
246 const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
247 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
248 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
249 }
250
251 return Status{};
252}
253
254bool NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo *input,
255 const ITensorInfo *weights,
256 PadStrideInfo conv_info,
Usama Arif881f2de2019-04-12 10:29:17 +0100257 unsigned int depth_multiplier,
258 const Size2D &dilation)
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000259{
260 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
261
262 // Reshape input shape if in NHWC format
263 const DataLayout data_layout = input->data_layout();
264 TensorShape in_shape{ input->tensor_shape() };
265 if(data_layout == DataLayout::NHWC)
266 {
267 in_shape.set(Window::DimX, input->tensor_shape().y());
268 in_shape.set(Window::DimY, input->tensor_shape().z());
269 in_shape.set(Window::DimZ, input->tensor_shape().x());
270 }
271
272 // Check data type
273 const DataType data_type = weights->data_type();
274 bool is_data_type_valid = is_data_type_float(data_type) || is_data_type_quantized_asymmetric(data_type);
275
276 // Check weighs size
277 const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
278 const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
279 bool weights_supported = (weights->dimension(width_idx) == 3) && (weights->dimension(height_idx) == 3);
280
281 // Check for supported strides
282 const auto &strides = conv_info.stride();
283 bool supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
284
285 // Check for supported padding
286 const auto pad_top = conv_info.pad_top();
287 const auto pad_right = conv_info.pad_right();
288 const auto pad_bottom = conv_info.pad_bottom();
289 const auto pad_left = conv_info.pad_left();
290 PadStrideInfo same_pad = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
291 bool is_same_padding = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
292 bool is_valid_padding = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
293 bool supported_padding = is_same_padding || is_valid_padding;
Usama Arif881f2de2019-04-12 10:29:17 +0100294 bool is_dilation_1 = dilation.x() == 1 && dilation.y() == 1;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000295
Usama Arif881f2de2019-04-12 10:29:17 +0100296 return is_data_type_valid && weights_supported && supported_strides && supported_padding && (depth_multiplier == 1) && is_dilation_1;
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000297}
298
299void NEDepthwiseConvolutionAssemblyDispatch::run()
300{
301 // Prepare assembly kernel
302 prepare();
303
Georgios Pinitasda953f22019-04-02 17:27:03 +0100304 MemoryGroupResourceScope scope_mg(_memory_group);
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000305
306 // Setup inputs/outputs
307 ARM_COMPUTE_ERROR_ON(_workspace.buffer() == nullptr);
308 _dwc_assembly_kernel->set_working_space(static_cast<void *>(_workspace.buffer()));
309
310 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
311 const int input_element_size = _input->info()->element_size();
312 const int input_batch_stride = _input->info()->strides_in_bytes()[3] / input_element_size;
313 const int input_row_stride = _input->info()->strides_in_bytes().z() / input_element_size;
314 const int input_col_stride = _input->info()->strides_in_bytes().y() / input_element_size;
315 const void *input_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes();
316 _dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);
317
318 ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
319 const int output_element_size = _output->info()->element_size();
320 const int output_batch_stride = _output->info()->strides_in_bytes()[3] / output_element_size;
321 const int output_row_stride = _output->info()->strides_in_bytes().z() / output_element_size;
322 const int output_col_stride = _output->info()->strides_in_bytes().y() / output_element_size;
323 void *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes();
324 _dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);
325
326 // Schedule assembly kernel
327 NEScheduler::get().schedule(&_dwc_acl_kernel, Window::DimX);
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000328}
329
330void NEDepthwiseConvolutionAssemblyDispatch::prepare()
331{
332 if(!_is_prepared)
333 {
334 _packed_weights.allocator()->allocate();
335 ARM_COMPUTE_ERROR_ON(_packed_weights.buffer() == nullptr);
336
337 // Pack weights and bias
338 const int weights_element_size = _weights->info()->element_size();
339 const int weights_row_stride = _weights->info()->strides_in_bytes().z() / weights_element_size;
340 const int weights_col_stride = _weights->info()->strides_in_bytes().y() / weights_element_size;
341 _dwc_assembly_kernel->pack_params(_packed_weights.buffer(),
342 _weights->buffer() + _weights->info()->offset_first_element_in_bytes(),
343 weights_row_stride,
344 weights_col_stride,
345 (_bias != nullptr) ? _bias->buffer() : nullptr);
346 _dwc_assembly_kernel->set_packed_params_buffer(_packed_weights.buffer());
347
348 _weights->mark_as_unused();
349 if(_bias != nullptr)
350 {
351 _bias->mark_as_unused();
352 }
353 _is_prepared = true;
354 }
355}
356} // namespace arm_compute