blob: 7c029a6b542777dd57ab24c4042474cebf9d0faf [file] [log] [blame]
/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyroub91e34c2017-12-20 15:50:55 +000024#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/AccessWindowStatic.h"
Anthony Barbiereaefd002018-07-20 17:49:35 +010027#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/Error.h"
29#include "arm_compute/core/Helpers.h"
30#include "arm_compute/core/ITensor.h"
Georgios Pinitasf72f9362018-01-12 16:29:45 +000031#include "arm_compute/core/NEON/NEAsymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/NEON/NEFixedPoint.h"
33#include "arm_compute/core/Types.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36
37#include <arm_neon.h>
38#include <cstddef>
39#include <cstdint>
40
41using namespace arm_compute;
42
43namespace
44{
Michele Di Giorgioff271922019-07-17 15:59:32 +010045Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
46 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Michalis Spyrouafa5d812017-11-30 14:25:57 +000047{
Michele Di Giorgioff271922019-07-17 15:59:32 +010048 ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
49 ARM_COMPUTE_UNUSED(result_offset_after_shift);
Anthony Barbiereaefd002018-07-20 17:49:35 +010050 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
Giorgio Arena1ed1fc62018-03-26 16:20:05 +010051 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010052 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
53 DataType::F16,
Vidhya Sudhan Loganathanf4cb81b2018-07-04 15:13:14 +010054 DataType::S32, DataType::F32);
Michalis Spyroub91e34c2017-12-20 15:50:55 +000055
Michele Di Giorgioff271922019-07-17 15:59:32 +010056 ARM_COMPUTE_RETURN_ERROR_ON_MSG(result_shift < 0, "Result shift must be a non negative integer");
Michalis Spyroub91e34c2017-12-20 15:50:55 +000057 if(bias != nullptr)
Michalis Spyrouafa5d812017-11-30 14:25:57 +000058 {
Vidhya Sudhan Loganathanf4cb81b2018-07-04 15:13:14 +010059 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::S32, DataType::F32);
Michalis Spyroub91e34c2017-12-20 15:50:55 +000060
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010061 if(is_data_type_quantized_asymmetric(input->data_type()))
Georgios Pinitas19d05472018-02-01 16:44:12 +000062 {
63 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
64 }
Michalis Spyroub91e34c2017-12-20 15:50:55 +000065 else
66 {
67 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
68 }
69
Giorgio Arena1ed1fc62018-03-26 16:20:05 +010070 ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
Michalis Spyroub91e34c2017-12-20 15:50:55 +000071 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
Michalis Spyrouafa5d812017-11-30 14:25:57 +000072 }
73 else
74 {
Georgios Pinitas19d05472018-02-01 16:44:12 +000075 ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()), "Calling output stage kernel with floating point arguments");
Michalis Spyrouafa5d812017-11-30 14:25:57 +000076 }
77
Michalis Spyrouafa5d812017-11-30 14:25:57 +000078 // Checks performed when output is configured
79 if((output != nullptr) && (output->total_size() != 0))
80 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010081 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::F32);
Giorgio Arena1ed1fc62018-03-26 16:20:05 +010082 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
83
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010084 if(is_data_type_quantized_asymmetric(output->data_type()))
Georgios Pinitasf72f9362018-01-12 16:29:45 +000085 {
86 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias");
Michalis Spyroub91e34c2017-12-20 15:50:55 +000087 }
88 else
89 {
90 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
91 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +000092 }
93
Michalis Spyrouafa5d812017-11-30 14:25:57 +000094 return Status{};
95}
96
97std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
98{
Giorgio Arena1ed1fc62018-03-26 16:20:05 +010099 ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
100
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000101 bool window_changed = false;
102 unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
103
104 // Update processed elements when input is S32 (comes from quantization input)
105 if(input->data_type() == DataType::S32)
106 {
107 num_elems_processed_per_iteration = 16;
108 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000109
110 // Configure kernel window
111 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
112 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000113
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000114 if(output != nullptr && (output->total_size() != 0))
115 {
116 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000117
118 if(bias == nullptr)
119 {
120 window_changed = update_window_and_padding(win, input_access, output_access);
121 }
122 else
123 {
124 AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
125 window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
126 }
127
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000128 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
129 }
130 else
131 {
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000132 if(bias == nullptr)
133 {
134 window_changed = update_window_and_padding(win, input_access);
135 }
136 else
137 {
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100138 if(input->data_layout() == DataLayout::NCHW)
139 {
140 AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
141 window_changed = update_window_and_padding(win, input_access, bias_access);
142 }
143 else
144 {
145 AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
146 window_changed = update_window_and_padding(win, input_access, bias_access);
147 }
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000148 }
149
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000150 input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
151 }
152
153 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
154 return std::make_pair(err, win);
155}
156
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157// Internal load
158inline float32x4_t internal_vld1q(const float *in)
159{
160 return vld1q_f32(in);
161}
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100162
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100163// Internal store
164inline void internal_vst1q(float *p, const float32x4_t &v)
165{
166 vst1q_f32(p, v);
167}
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100168
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100169// Internal vdup
170inline float32x4_t internal_vdupq_n(float v)
171{
172 return vdupq_n_f32(v);
173}
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100174
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100175// Internal vadd
176inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
177{
178 return vaddq_f32(x, y);
179}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100180
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Load eight consecutive half floats starting at @p in into a vector (wraps vld1q_f16). */
inline float16x8_t internal_vld1q(const float16_t *in)
{
    return vld1q_f16(in);
}

/** Store the eight half floats of @p v to consecutive memory starting at @p p (wraps vst1q_f16). */
inline void internal_vst1q(float16_t *p, const float16x8_t &v)
{
    vst1q_f16(p, v);
}

/** Build a vector with all eight lanes set to @p v (wraps vdupq_n_f16). */
inline float16x8_t internal_vdupq_n(float16_t v)
{
    return vdupq_n_f16(v);
}

/** Lane-wise sum of @p x and @p y (wraps vaddq_f16). */
inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
{
    return vaddq_f16(x, y);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0d176142017-07-06 16:43:14 +0100199
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000200template <typename T1, typename T2, bool in_place, bool has_bias>
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100201void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
202 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100203{
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100204 ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000205 ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
206 ARM_COMPUTE_UNUSED(result_shift);
207 ARM_COMPUTE_UNUSED(result_offset_after_shift);
208
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100209 Iterator in(input, window);
210
211 if(in_place) // In place accumulate
212 {
213 execute_window_loop(window, [&](const Coordinates & id)
214 {
215 // Get bias and pointer to input
216 const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100217
218 // Accumulate bias
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000219 if(has_bias)
220 {
221 const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
222 internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
223 }
224 else
225 {
226 internal_vst1q(in_ptr, internal_vld1q(in_ptr));
227 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100228 },
229 in);
230 }
231 else // Out of place accumulate
232 {
233 Iterator out(output, window);
234 execute_window_loop(window, [&](const Coordinates & id)
235 {
236 // Get bias and pointer to input
237 const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
238 const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239
240 // Accumulate bias
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000241 if(has_bias)
242 {
243 const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
244 internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
245 }
246 else
247 {
248 internal_vst1q(out_ptr, internal_vld1q(in_ptr));
249 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100250 },
251 in, out);
252 }
253}
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000254
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100255template <typename T1, typename T2, bool in_place, bool has_bias>
256void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
257 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
258{
259 ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
260 ARM_COMPUTE_UNUSED(result_shift);
261 ARM_COMPUTE_UNUSED(result_offset_after_shift);
262
263 Window window_bias = window;
264 window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
265 window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
266 window_bias.set(3, Window::Dimension(0, 0, 0));
267
268 Iterator in(input, window);
269 Iterator bi(bias, window_bias);
270
271 if(in_place) // In place accumulate
272 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100273 execute_window_loop(window, [&](const Coordinates &)
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100274 {
275 // Get bias and pointer to input
276 const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
277 const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
278
279 // Accumulate bias
280 if(has_bias)
281 {
282 internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
283 }
284 else
285 {
286 internal_vst1q(in_ptr, internal_vld1q(in_ptr));
287 }
288 },
289 in, bi);
290 }
291 else // Out of place accumulate
292 {
293 Iterator out(output, window);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100294 execute_window_loop(window, [&](const Coordinates &)
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100295 {
296 // Get bias and pointer to input
297 const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
298 const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
299 const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
300
301 // Accumulate bias
302 if(has_bias)
303 {
304 internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
305 }
306 else
307 {
308 internal_vst1q(out_ptr, internal_vld1q(in_ptr));
309 }
310 },
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100311 in, bi, out);
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100312 }
313}
314
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000315// QASYMM8 specializations
316template <>
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100317void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
318 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000319{
320 const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
321 uint8x16_t min = vdupq_n_u8(0);
322 uint8x16_t max = vdupq_n_u8(255);
323
324 Iterator in(input, window);
325 Iterator out(output, window);
326
327 execute_window_loop(window, [&](const Coordinates & id)
328 {
329 // Get bias and pointer to input
330 const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
331 int32x4x4_t v_in =
332 {
333 {
334 vld1q_s32(in_ptr),
335 vld1q_s32(in_ptr + 4),
336 vld1q_s32(in_ptr + 8),
337 vld1q_s32(in_ptr + 12)
338 }
339 };
340
341 // Accumulate bias
342 const auto vb = vdupq_n_s32(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))));
343 v_in =
344 {
345 {
346 vaddq_s32(v_in.val[0], vb),
347 vaddq_s32(v_in.val[1], vb),
348 vaddq_s32(v_in.val[2], vb),
349 vaddq_s32(v_in.val[3], vb)
350 }
351 };
352
353 const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
354 vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
355 },
356 in, out);
357}
358template <>
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100359void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
360 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000361{
362 ARM_COMPUTE_UNUSED(bias);
363
364 const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
365 uint8x16_t min = vdupq_n_u8(0);
366 uint8x16_t max = vdupq_n_u8(255);
367
368 Iterator in(input, window);
369 Iterator out(output, window);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100370 execute_window_loop(window, [&](const Coordinates &)
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000371 {
372 // Get bias and pointer to input
373 const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
374 int32x4x4_t v_in =
375 {
376 {
377 vld1q_s32(in_ptr),
378 vld1q_s32(in_ptr + 4),
379 vld1q_s32(in_ptr + 8),
380 vld1q_s32(in_ptr + 12)
381 }
382 };
383
384 const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
385 vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
386 },
387 in, out);
388}
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100389template <>
390void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
391 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
392{
393 const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
394 uint8x16_t min = vdupq_n_u8(0);
395 uint8x16_t max = vdupq_n_u8(255);
396
397 Window window_bias = window;
398 window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
399 window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
400 window_bias.set(3, Window::Dimension(0, 0, 0));
401
402 Iterator in(input, window);
403 Iterator bi(bias, window_bias);
404
405 Iterator out(output, window);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100406 execute_window_loop(window, [&](const Coordinates &)
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100407 {
408 // Get bias and pointer to input
409 const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
410 const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
411
412 // Accumulate bias
413 int32x4x4_t v_in =
414 {
415 {
416 vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
417 vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
418 vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
419 vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
420 }
421 };
422
423 const auto out_ptr = out.ptr();
424 vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
425 },
426 in, bi, out);
427}
428template <>
429void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
430 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
431{
432 ARM_COMPUTE_UNUSED(bias);
433
434 const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
435 uint8x16_t min = vdupq_n_u8(0);
436 uint8x16_t max = vdupq_n_u8(255);
437
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100438 Iterator in(input, window);
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100439 Iterator out(output, window);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100440 execute_window_loop(window, [&](const Coordinates &)
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100441 {
Georgios Pinitas551852f2018-11-29 13:58:31 +0000442 // Get pointer to input
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100443 const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
444
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100445 int32x4x4_t v_in =
446 {
447 {
448 vld1q_s32(in_ptr),
449 vld1q_s32(in_ptr + 4),
450 vld1q_s32(in_ptr + 8),
451 vld1q_s32(in_ptr + 12)
452 }
453 };
454
455 const auto out_ptr = out.ptr();
456 vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
457 },
Georgios Pinitas551852f2018-11-29 13:58:31 +0000458 in, out);
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100459}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460} // namespace
461
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000462NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000463 : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100464{
465}
466
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000467void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
468 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100469{
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000470 ARM_COMPUTE_ERROR_ON_NULLPTR(input);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000471
Georgios Pinitas0223a782017-12-12 11:44:44 +0000472 // Auto-initialize output output if required
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100473 if(output != nullptr)
474 {
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000475 // Work out expected output data type
476 const DataType output_dt = (input->info()->data_type() == DataType::S32) ? DataType::QASYMM8 : input->info()->data_type();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000477 // Output tensor auto initialization if not yet initialized
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000478 auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100479 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000480
481 // Perform validation step
Michele Di Giorgioff271922019-07-17 15:59:32 +0100482 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(),
483 result_fixedpoint_multiplier, result_shift, result_offset_after_shift));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100484
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000485 _func = nullptr;
486 _bias = bias;
487 _input = input;
488 _output = output;
489 _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
490 _result_shift = result_shift;
491 _result_offset_after_shift = result_offset_after_shift;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100492
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100493 // Configure kernel window
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000494 auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info());
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000495 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
496 INEKernel::configure(win_config.second);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100497
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000498 const bool has_bias = bias != nullptr;
499
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100500 // Set appropriate function
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100501 if(input->info()->data_layout() == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100502 {
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100503 switch(input->info()->data_type())
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100504 {
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100505 case DataType::S32:
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000506 {
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100507 _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100508 break;
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100509 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000510#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100511 case DataType::F16:
512 {
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000513 if(has_bias)
514 {
515 _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
516 }
517 else
518 {
519 _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, false> : &output_stage_nchw<float16_t, float16_t, false, false>;
520 }
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100521 break;
522 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000523#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100524 case DataType::F32:
525 {
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000526 if(has_bias)
527 {
528 _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
529 }
530 else
531 {
532 _func = (output == nullptr) ? &output_stage_nchw<float, float, true, false> : &output_stage_nchw<float, float, false, false>;
533 }
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100534 break;
535 }
536 default:
537 {
538 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
539 }
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100540 }
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100541 }
542 else
543 {
544 switch(input->info()->data_type())
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100545 {
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100546 case DataType::S32:
547 {
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000548 _func = (bias == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100549 break;
550 }
Georgios Pinitas20c246a2018-09-12 16:45:53 +0100551#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
552 case DataType::F16:
553 {
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000554 if(has_bias)
555 {
556 _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
557 }
558 else
559 {
560 _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, false> : &output_stage_nhwc<float16_t, float16_t, false, false>;
561 }
Georgios Pinitas20c246a2018-09-12 16:45:53 +0100562 break;
563 }
564#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100565 case DataType::F32:
566 {
Gian Marco Iodice618493d2018-11-27 16:38:33 +0000567 if(has_bias)
568 {
569 _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
570 }
571 else
572 {
573 _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, false> : &output_stage_nhwc<float, float, false, false>;
574 }
Giorgio Arena1ed1fc62018-03-26 16:20:05 +0100575 break;
576 }
577 default:
578 {
579 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
580 }
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100581 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100582 }
583}
584
Michele Di Giorgioff271922019-07-17 15:59:32 +0100585Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
586 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000587{
Michele Di Giorgioff271922019-07-17 15:59:32 +0100588 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift));
Anthony Barbierde014682018-07-03 15:10:48 +0100589 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias == nullptr ? nullptr : bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000590
591 return Status{};
592}
593
Michalis Spyroub91e34c2017-12-20 15:50:55 +0000594void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100595{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100596 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100597 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
598 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
599 ARM_COMPUTE_ERROR_ON(_func == nullptr);
600
Georgios Pinitasf72f9362018-01-12 16:29:45 +0000601 (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100602}