blob: 65b7087d7ec1663d58229c1aa6f7770b288195f9 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEFixedPoint.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
34
35#include <arm_neon.h>
36#include <cstddef>
37#include <cstdint>
38
39using namespace arm_compute;
40
41namespace
42{
// Validate the input/bias/output data-type combination for the bias-accumulate
// kernel. Returns an empty Status on success, an error Status otherwise.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, bias);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
    // Fixed-point inputs pair with a narrower bias type: QS8 keeps a QS8 bias,
    // QS16 requires a QS8 bias, QS32 requires a QS16 bias (mirrors the template
    // instantiations selected in configure()).
    if(is_data_type_quantized(input->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
    }
    else
    {
        // Floating-point path: bias must match the input type exactly
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
    }

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);

    // Checks performed when output is configured
    if((output != nullptr) && (output->total_size() != 0))
    {
        // NOTE(review): F16 and QS32 are accepted as input types above but are
        // absent from this output whitelist — confirm whether out-of-place F16
        // output is intentionally rejected here.
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(bias, output);
    }

    // Bias must be a vector (at most one dimension)
    ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);

    return Status{};
}
73
74std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
75{
76 bool window_changed = false;
77 const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
78
79 // Configure kernel window
80 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
81 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
82 AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
83 if(output != nullptr && (output->total_size() != 0))
84 {
85 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
86 window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
87 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
88 }
89 else
90 {
91 window_changed = update_window_and_padding(win, input_access, bias_access);
92 input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
93 }
94
95 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
96 return std::make_pair(err, win);
97}
98
// Internal load helpers: overload set selecting the matching NEON /
// fixed-point load intrinsic for each supported element type.
inline float32x4_t internal_vld1q(const float *ptr)
{
    return vld1q_f32(ptr);
}
inline qint8x16_t internal_vld1q(const qint8_t *ptr)
{
    return vld1q_qs8(ptr);
}
inline qint16x8_t internal_vld1q(const qint16_t *ptr)
{
    return vld1q_qs16(ptr);
}
inline qint32x4_t internal_vld1q(const qint32_t *ptr)
{
    return vld1q_s32(ptr);
}
117
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100118// Internal store
119inline void internal_vst1q(float *p, const float32x4_t &v)
120{
121 vst1q_f32(p, v);
122}
123inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
124{
125 vst1q_qs8(p, v);
126}
127inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
128{
129 vst1_qs8(p, vqmovn_s16(v));
130}
131inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
132{
133 vst1q_qs16(p, v);
134}
135
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100136inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
137{
138 vst1q_s32(p, v);
139}
140
141inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
142{
143 vst1_qs16(p, vqmovn_qs32(v));
144}
145
// Internal duplicate helpers: broadcast a single scalar across every lane
// of the corresponding vector type.
inline float32x4_t internal_vdupq_n(float val)
{
    return vdupq_n_f32(val);
}
inline qint8x16_t internal_vdupq_n(qint8_t val)
{
    return vdupq_n_qs8(val);
}
inline qint16x8_t internal_vdupq_n(qint16_t val)
{
    return vdupq_n_qs16(val);
}
inline qint32x4_t internal_vdupq_n(qint32_t val)
{
    return vdupq_n_qs32(val);
}
164
// Internal add helpers: saturating adds for the fixed-point types; the float
// overload is a plain (non-saturating) add since floats cannot wrap.
inline float32x4_t internal_vqaddq(const float32x4_t &a, const float32x4_t &b)
{
    return vaddq_f32(a, b);
}
inline qint8x16_t internal_vqaddq(const qint8x16_t &a, const qint8x16_t &b)
{
    return vqaddq_qs8(a, b);
}
inline qint16x8_t internal_vqaddq(const qint16x8_t &a, const qint16x8_t &b)
{
    return vqaddq_qs16(a, b);
}
inline qint32x4_t internal_vqaddq(const qint32x4_t &a, const qint32x4_t &b)
{
    return vqaddq_qs32(a, b);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100182
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Half-precision overloads of the helpers above; only compiled when the
// toolchain provides FP16 vector arithmetic.
inline float16x8_t internal_vld1q(const float16_t *ptr)
{
    return vld1q_f16(ptr);
}
inline void internal_vst1q(float16_t *ptr, const float16x8_t &val)
{
    vst1q_f16(ptr, val);
}
inline float16x8_t internal_vdupq_n(float16_t val)
{
    return vdupq_n_f16(val);
}
inline float16x8_t internal_vqaddq(const float16x8_t &a, const float16x8_t &b)
{
    return vaddq_f16(a, b);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0d176142017-07-06 16:43:14 +0100201
// Accumulate a per-channel bias into a tensor, one NEON vector at a time.
//
// T1       element type of the input tensor (and the vector lane type loaded/added)
// T2       element type of the bias tensor; on the out-of-place path it is also
//          the element type written to 'output' (the narrowing store overloads
//          handle T1 -> T2, e.g. qint32 -> qint16)
// in_place when true, results are written back into 'input' and 'output' is ignored
//
// The bias value is looked up with the window's z coordinate (presumably the
// output-feature-map dimension — one bias value per channel; bias is validated
// to be 1D) and broadcast across the vector. Note the plain static_cast widens
// the bias value without any fixed-point repositioning.
//
// NOTE(review): 'window' is taken by value; a const reference would avoid the
// copy, but the signature must stay compatible with the kernel's stored
// function-pointer type — confirm against the header before changing.
template <typename T1, typename T2, bool in_place>
void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
{
    Iterator in(input, window);

    if(in_place) // In place accumulate
    {
        execute_window_loop(window, [&](const Coordinates & id)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
            const auto vb     = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));

            // Accumulate bias (load, saturating add, store back in place)
            internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
        },
        in);
    }
    else // Out of place accumulate
    {
        Iterator out(output, window);
        execute_window_loop(window, [&](const Coordinates & id)
        {
            // Get bias and pointers to input and output
            const auto in_ptr  = reinterpret_cast<const T1 *>(in.ptr());
            const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
            const auto vb      = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));

            // Accumulate bias; the store overload narrows to T2 when T1 is wider
            internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
        },
        in, out);
    }
}
236} // namespace
237
// Default-construct the kernel with no function selected and no tensors bound;
// configure() must be called before run().
NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
{
}
242
// Configure the kernel: validate the tensors, set up the execution window and
// select the type-specialized accumulate function. Passing a null 'output'
// selects the in-place variant (results written back into 'input').
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, bias);

    // Auto-initialize output if required
    if(output != nullptr)
    {
        // Output tensor auto initialization if not yet initialized
        auto_init_if_empty(*output->info(), *input->info());
    }

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info()));

    _func   = nullptr;
    _bias   = bias;
    _input  = input;
    _output = output;

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info());
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);

    // Set appropriate function: the template arguments are <input type, bias
    // type, in_place>; in-place is used whenever no output tensor was given
    switch(input->info()->data_type())
    {
        case DataType::QS8:
        {
            _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
            break;
        }
        case DataType::QS16:
        {
            // QS16 accumulation only supports a QS8 bias (see validate_arguments)
            if(bias->info()->data_type() == DataType::QS8)
            {
                _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
            }
            else
            {
                ARM_COMPUTE_ERROR("Not implemented");
            }
            break;
        }
        case DataType::QS32:
        {
            // QS32 accumulator pairs with a QS16 bias; the out-of-place path
            // narrows to QS16 on store
            _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
            break;
        }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
        {
            _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
            break;
        }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
        {
            _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
            break;
        }
    }
}
311
// Static validation entry point: runs the argument checks, then dry-runs the
// window configuration on clones so the callers' tensor infos are not modified.
Status NEDirectConvolutionLayerBiasAccumulateKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);

    return Status{};
}
319
// Execute the kernel on the given (sub)window; requires configure() to have
// been called so that a type-specialized accumulate function is selected.
void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    // Dispatch to the function pointer chosen in configure()
    (*_func)(_input, _bias, window, _output);
}