/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
25
26#include "arm_compute/core/Utils.h"
27#include "arm_compute/core/Validate.h"
28#include "arm_compute/core/utils/misc/ShapeCalculator.h"
29#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
30#include "src/core/CPP/Validate.h"
31#include "src/core/helpers/AutoConfiguration.h"
32#include "src/core/helpers/WindowHelpers.h"
33#include "src/core/utils/AssemblyUtils.h"
34
35#include "src/core/NEON/kernels/assembly/depthwise.hpp"
36
37#include "depthwise_common.hpp"
38
39#include <arm_neon.h>
40
41namespace arm_compute
42{
43namespace cpu
44{
45namespace kernels
46{
47using namespace arm_compute::misc::shape_calculator;
48
49namespace
50{
// Positions of each logical dimension inside a tensor shape, listed in index order.
// Channels sit at index 0, i.e. the innermost dimension (validate() only accepts NHWC sources).
constexpr unsigned int idx_channels{ 0 };
constexpr unsigned int idx_width{ 1 };
constexpr unsigned int idx_height{ 2 };
constexpr unsigned int idx_batches{ 3 };
55
56template <typename TSrc, typename TWeights, typename TDst>
57void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
58 const ConvolutionInfo &info, const CPUInfo &cpu_info,
59 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
60{
61 unsigned int stride_cols{};
62 unsigned int stride_rows{};
63 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
64
65 const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
66
67 const unsigned int n_batches = src->dimension(idx_batches);
68 const unsigned int src_rows = src->dimension(idx_height);
69 const unsigned int src_cols = src->dimension(idx_width);
70 const unsigned int n_channels = src->dimension(idx_channels);
71 const unsigned int dst_rows = dst->dimension(idx_height);
72 const unsigned int dst_cols = dst->dimension(idx_width);
73
74 const unsigned int kernel_cols = weights->dimension(idx_width);
75 const unsigned int kernel_rows = weights->dimension(idx_height);
76
77 const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
78
79 arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
80 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
81 padding, activation, nullptr);
82
83 // Configure assembly pooling kernel
84 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
85 if(dwc_kernel_asm == nullptr)
86 {
87 // Configuration not supported: Leave function unconfigured:
88 return;
89 }
90
91 kernel = std::move(dwc_kernel_asm);
92}
93
94template <typename TSrc, typename TWeights, typename TDst>
95void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
96 const ConvolutionInfo &info, const CPUInfo &cpu_info,
97 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
98 std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts)
99{
100 unsigned int stride_cols{};
101 unsigned int stride_rows{};
102 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
103
104 const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
105
106 const unsigned int n_batches = src->dimension(idx_batches);
107 const unsigned int src_rows = src->dimension(idx_height);
108 const unsigned int src_cols = src->dimension(idx_width);
109 const unsigned int n_channels = src->dimension(idx_channels);
110 const unsigned int dst_rows = dst->dimension(idx_height);
111 const unsigned int dst_cols = dst->dimension(idx_width);
112
113 const unsigned int kernel_cols = weights->dimension(idx_width);
114 const unsigned int kernel_rows = weights->dimension(idx_height);
115
116 const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
117
118 arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
119 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
120 padding, activation, nullptr);
121
122 const auto src_qinfo = src->quantization_info().uniform();
123 const auto weights_qinfo = weights->quantization_info();
124 const auto dst_qinfo = dst->quantization_info().uniform();
125
126 const unsigned int num_filters = weights_qinfo.scale().size();
127
128 multipliers.resize(num_filters);
129 std::vector<int32_t> dst_shifts(num_filters);
130 quantization::compute_quantized_multipliers_and_shifts(src,
131 weights,
132 dst,
133 multipliers.data(),
134 dst_shifts.data());
135
136 // Quantize activation bounds
137 int32_t min_activation = std::numeric_limits<TSrc>::lowest();
138 int32_t max_activation = std::numeric_limits<TSrc>::max();
139 if(info.act_info.enabled())
140 {
141 std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
142 }
143
144 // Set quantization parameters for assembly kernels
145 arm_gemm::Requantize32 requant_args{};
146 if(is_data_type_quantized_per_channel(weights->data_type()))
147 {
148 left_shifts.resize(num_filters);
149 right_shifts.resize(num_filters);
150 bool need_left_shift = false; // Select more optimized path if left shift is not needed
151 for(unsigned int i = 0; i < num_filters; ++i)
152 {
153 left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
154 right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
155 if(dst_shifts[i] < 0 && !need_left_shift)
156 {
157 need_left_shift = true;
158 }
159 }
160
161 requant_args = arm_gemm::Requantize32(nullptr,
162 0,
163 src_qinfo.offset,
164 weights_qinfo.uniform().offset,
165 dst_qinfo.offset,
166 (need_left_shift) ? left_shifts.data() : nullptr,
167 right_shifts.data(),
168 multipliers.data(),
169 static_cast<TSrc>(min_activation),
170 static_cast<TSrc>(max_activation));
171 }
172 else
173 {
174 requant_args = arm_gemm::Requantize32(nullptr,
175 0,
176 src_qinfo.offset,
177 weights_qinfo.uniform().offset,
178 dst_qinfo.offset,
179 -dst_shifts[0],
180 multipliers[0],
181 static_cast<TSrc>(min_activation),
182 static_cast<TSrc>(max_activation));
183 }
184
185 // Configure assembly pooling kernel with requantization
186 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
187 if(dwc_kernel_asm == nullptr)
188 {
189 // Configuration not supported: Leave function unconfigured:
190 return;
191 }
192
193 kernel = std::move(dwc_kernel_asm);
194}
195} // namespace
196
197CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
198 : _kernel_asm(nullptr),
199 _multipliers(),
200 _left_shifts(),
201 _right_shifts()
202{
203}
204
205CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
206
207void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
208 const ConvolutionInfo &info, const CPUInfo &cpu_info)
209{
210 ARM_COMPUTE_UNUSED(cpu_info);
211 ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
212
213 // Destination initialization if not yet initialized
214 const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
215 auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
216
217#if defined(__aarch64__)
218 switch(src->data_type())
219 {
220 case DataType::QASYMM8:
221 if(is_data_type_quantized_per_channel(weights->data_type()))
222 {
223 create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
224 }
225 else
226 {
227 create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
228 }
229 break;
230 case DataType::QASYMM8_SIGNED:
231 create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
232 break;
233#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
234 case DataType::F16:
235 create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm);
236 break;
237#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
238 case DataType::F32:
239 create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm);
240 break;
241 default:
242 break;
243 }
244#endif // defined(__aarch64__)
245
246 Window win = calculate_max_window(*dst, Steps());
247 ICpuKernel::configure(win);
248}
249
250Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
251{
252 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
253
254#if !defined(__aarch64__)
255 ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
256#endif // !defined(__aarch64__)
257 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
258 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
259 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
260 ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)");
261
262 if(is_data_type_quantized_per_channel(weights->data_type()))
263 {
264 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
265 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
266 }
267 else
268 {
269 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
270 }
271
272 if(bias != nullptr)
273 {
274 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
275 ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
276
277 if(is_data_type_quantized(src->data_type()))
278 {
279 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
280 }
281 else
282 {
283 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
284 }
285 }
286
287 if(dst->total_size() > 0)
288 {
289 const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
290 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
291 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
292 }
293 return Status{};
294}
295
296void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
297{
298 ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
299 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
300 ARM_COMPUTE_UNUSED(window);
301 ARM_COMPUTE_UNUSED(info);
302
303 ARM_COMPUTE_ERROR_ON(tensors.empty());
304
305 const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
306 ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
307 ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
308 ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
309
310 const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
311 auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
312 auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
313 auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
314
315 const auto src_shape = src->info()->tensor_shape();
316 const auto dst_shape = dst->info()->tensor_shape();
317 const auto src_padding = src->info()->padding();
318 const auto dst_padding = dst->info()->padding();
319
320 const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
321 const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
322 const size_t ld_src_batch = ld_src_row * src_shape[2];
323 const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
324 const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
325 const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
326
327 _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
328 parameters_ptr,
329 dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
330 working_space, info.thread_id, info.num_threads);
331}
332
333void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
334{
335 _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
336}
337
338size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const
339{
340 return _kernel_asm->get_storage_size();
341}
342
343size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
344{
345 return _kernel_asm->get_working_size(num_threads, num_input_channels);
346}
347
348bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const
349{
350 return _kernel_asm != nullptr;
351}
352
353const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const
354{
355 return "CpuDepthwiseConv2dAssemblyWrapperKernel";
356}
357} // namespace kernels
358} // namespace cpu
359} // namespace arm_compute