blob: 5df3e3ee7df719157576347b3f54895f566f754f [file] [log] [blame]
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000024#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEFixedPoint.h"
Michalis Spyroue2588182018-12-13 18:31:18 +000031#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
Michalis Spyrou4de2d592020-02-21 18:58:38 +000034#include "arm_compute/core/utils/misc/SaturateCast.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035
Georgios Pinitase8291ac2020-02-26 09:58:13 +000036#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
38using namespace arm_compute;
39
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010040namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010042Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
43{
44 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
45 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
Georgios Pinitase8291ac2020-02-26 09:58:13 +000046 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(input);
47 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(output);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010048 ARM_COMPUTE_UNUSED(policy);
49 ARM_COMPUTE_RETURN_ERROR_ON(input == output);
Georgios Pinitase8291ac2020-02-26 09:58:13 +000050 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
51 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
52 DataType::F32, DataType::S32);
53 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
54 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
55 DataType::U32, DataType::S32, DataType::F32);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010056 ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
57
Luca Foschianidaa3aba2020-01-08 15:55:08 +000058 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8_SIGNED && (output->data_type() != DataType::S16 && output->data_type() != DataType::S32
59 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
60 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
61
Usama Arif9e631c22019-05-14 17:10:40 +010062 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Michalis Spyrou6bff1952019-10-02 17:22:11 +010063 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
Usama Arif9e631c22019-05-14 17:10:40 +010064 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
Michalis Spyroue2588182018-12-13 18:31:18 +000065
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010066 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Usama Arif9e631c22019-05-14 17:10:40 +010067 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
68 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010069
70 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
71 "Only data_types supported [in] U16 -> [out] U8, U32");
72
Luca Foschianidaa3aba2020-01-08 15:55:08 +000073 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010074 "Only data_types supported [in] S16 -> [out] U8, S32");
75
Georgios Pinitase8291ac2020-02-26 09:58:13 +000076 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::BFLOAT16 && output->data_type() != DataType::F32,
77 "Only data_types supported [in] BFLOAT16 -> [out] F32");
78
Luca Foschianidaa3aba2020-01-08 15:55:08 +000079 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
80 && output->data_type() != DataType::U8
81 && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
Usama Arif9e631c22019-05-14 17:10:40 +010082 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010083
Luca Foschianidaa3aba2020-01-08 15:55:08 +000084 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
Georgios Pinitase8291ac2020-02-26 09:58:13 +000085 && output->data_type() != DataType::F16 && output->data_type() != DataType::BFLOAT16
Luca Foschianidaa3aba2020-01-08 15:55:08 +000086 && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
Georgios Pinitase8291ac2020-02-26 09:58:13 +000087 "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
Usama Arif9e631c22019-05-14 17:10:40 +010088
Luca Foschianidaa3aba2020-01-08 15:55:08 +000089 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
90 && output->data_type() != DataType::F16
91 && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
Usama Arif9e631c22019-05-14 17:10:40 +010092 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010093
94 // Validate in case of configured output
95 if(output->total_size() > 0)
96 {
97 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
98 }
99
100 return Status{};
101}
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100102} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000104NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100105 : _input(nullptr), _output(nullptr), _policy(), _shift(0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100106{
107}
108
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100109void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100110{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100111 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
112
113 // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
114 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
Georgios Pinitase2229412017-07-12 12:30:40 +0100115
116 _input = input;
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100117 _output = output;
Georgios Pinitase2229412017-07-12 12:30:40 +0100118 _policy = policy;
119 _shift = shift;
120
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100121 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
Georgios Pinitase2229412017-07-12 12:30:40 +0100122
123 // Configure kernel window
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000124 Window win = calculate_max_window(*input->info(), Steps());
125 Coordinates coord;
126 coord.set_num_dimensions(output->info()->num_dimensions());
127 output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
128
129 ICPPKernel::configure(win);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100130}
Georgios Pinitase2229412017-07-12 12:30:40 +0100131
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100132Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
133{
134 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100135 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136}
137
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000138void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100139{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100140 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Georgios Pinitase2229412017-07-12 12:30:40 +0100142 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100143 ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144 ARM_COMPUTE_ERROR_ON(_input == _output);
145
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000146 const auto window_start_x = static_cast<int>(window.x().start());
147 const auto window_end_x = static_cast<int>(window.x().end());
148 const int window_step_x = 16;
149
150 Window win{ window };
151 win.set(Window::DimX, Window::Dimension(0, 1, 1));
152
153 Iterator input(_input, win);
154 Iterator output(_output, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155
156 switch(_input->info()->data_type())
157 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000158 case DataType::QASYMM8_SIGNED:
159 {
160 const int16x8_t b = vdupq_n_s16(_shift);
161
162 switch(_output->info()->data_type())
163 {
164 case DataType::S16:
165 {
166 /* Up-conversion QASYMM8_SIGNED -> S16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000167 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000168 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000169 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
170 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
171 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000172
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000173 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000174 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000175 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000176
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000177 const int16x8x2_t texels =
178 {
179 {
180 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
181 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
182 }
183 };
184
185 vst1q_s16(output_ptr + x, texels.val[0]);
186 vst1q_s16(output_ptr + x + 8, texels.val[1]);
187 }
188
189 // Compute left-over elements
190 for(; x < window_end_x; ++x)
191 {
192 *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
193 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000194 },
195 input, output);
196 break;
197 }
198 case DataType::S32:
199 {
200 /* Up-conversion QASYMM8_SIGNED -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000201 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000202 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000203 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
204 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
205 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000206
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000207 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000208 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000209 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000210
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000211 const int16x8x2_t texels =
212 {
213 {
214 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
215 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
216 }
217 };
218
219 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
220 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
221 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
222 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
223 }
224
225 // Compute left-over elements
226 for(; x < window_end_x; ++x)
227 {
228 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
229 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000230 },
231 input, output);
232 break;
233 }
234 case DataType::F32:
235 {
236 /* Up-conversion QASYMM8_SIGNED -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000237 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000238 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000239 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
240 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000241
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000242 int x = window_start_x;
243 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000244 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000245 const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(input.ptr()));
246
247 const int16x8x2_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000248 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000249 {
250 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
251 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
252 }
253 };
254 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
255 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
256 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
257 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
258 }
259
260 // Compute left-over elements
261 for(; x < window_end_x; ++x)
262 {
263 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
264 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000265 },
266 input, output);
267 break;
268 }
269#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
270 case DataType::F16:
271 {
272 /* Up-conversion QASYMM8_SIGNED -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000273 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000274 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000275 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
276 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
277 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000278
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000279 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000280 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000281 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
282
283 const int16x8x2_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000284 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000285 {
286 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
287 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
288 }
289 };
290 vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
291 vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
292 }
293
294 // Compute left-over elements
295 for(; x < window_end_x; ++x)
296 {
297 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
298 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000299 },
300 input, output);
301 break;
302 }
303#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
304
305 default:
306 ARM_COMPUTE_ERROR("Output data type not supported");
307 }
308 break;
309 }
310
Michalis Spyroue2588182018-12-13 18:31:18 +0000311 case DataType::QASYMM8:
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100312 case DataType::U8:
313 {
314 const int16x8_t b = vdupq_n_s16(_shift);
315
316 switch(_output->info()->data_type())
317 {
318 case DataType::S16:
319 {
320 /* Up-conversion U8 -> S16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000321 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100322 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000323 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
324 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100325
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000326 int x = window_start_x;
327 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100328 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000329 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100330
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000331 const int16x8x2_t texels =
332 {
333 {
334 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
335 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
336 }
337 };
338
339 vst1q_s16(output_ptr + x, texels.val[0]);
340 vst1q_s16(output_ptr + x + 8, texels.val[1]);
341 }
342
343 // Compute left-over elements
344 for(; x < window_end_x; ++x)
345 {
346 auto in = static_cast<int32_t>(*(input_ptr + x));
347 *(output_ptr + x) = in << _shift;
348 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100349 },
350 input, output);
351 break;
352 }
353 case DataType::S32:
354 {
355 /* Up-conversion U8 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000356 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100357 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000358 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
359 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100360
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000361 int x = window_start_x;
362 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000364 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100365
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000366 const int16x8x2_t texels =
367 {
368 {
369 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
370 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
371 }
372 };
373
374 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
375 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
376 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
377 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
378 }
379
380 // Compute left-over elements
381 for(; x < window_end_x; ++x)
382 {
383 auto in = static_cast<uint32_t>(*(input_ptr + x));
384 *(output_ptr + x) = in << _shift;
385 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100386 },
387 input, output);
388 break;
389 }
Usama Arif9e631c22019-05-14 17:10:40 +0100390 case DataType::F32:
391 {
392 /* Up-conversion U8 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000393 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100394 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000395 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
396 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100397
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000398 int x = window_start_x;
399 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +0100400 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000401 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
402
403 const int16x8x2_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +0100404 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000405 {
406 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
407 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
408 }
409 };
410 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
411 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
412 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
413 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
414 }
415
416 // Compute left-over elements
417 for(; x < window_end_x; ++x)
418 {
419 auto in = static_cast<uint32_t>(*(input_ptr + x));
420 *(output_ptr + x) = static_cast<float>(in << _shift);
421 }
Usama Arif9e631c22019-05-14 17:10:40 +0100422 },
423 input, output);
424 break;
425 }
426#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
427 case DataType::F16:
428 {
429 /* Up-conversion U8 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000430 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100431 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000432 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
433 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100434
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000435 int x = window_start_x;
436 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +0100437 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000438 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
439
440 const int16x8x2_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +0100441 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000442 {
443 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
444 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
445 }
446 };
447 vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
448 vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
449 }
450
451 // Compute left-over elements
452 for(; x < window_end_x; ++x)
453 {
454 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
455 }
Usama Arif9e631c22019-05-14 17:10:40 +0100456 },
457 input, output);
458 break;
459 }
460#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100461 case DataType::U16:
462 {
463 /* Up-conversion U8 -> U16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000464 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100465 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000466 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
467 const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100468
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000469 int x = window_start_x;
470 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100471 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000472 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100473
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000474 const uint16x8x2_t texels =
475 {
476 {
477 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
478 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
479 }
480 };
481
482 vst1q_u16(output_ptr + x, texels.val[0]);
483 vst1q_u16(output_ptr + x + 8, texels.val[1]);
484 }
485
486 // Compute left-over elements
487 for(; x < window_end_x; ++x)
488 {
489 *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
490 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100491 },
492 input, output);
493 break;
494 }
495 default:
496 ARM_COMPUTE_ERROR("Output data type not supported");
497 }
498 break;
499 }
500 case DataType::S16:
501 {
502 switch(_output->info()->data_type())
503 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000504 case DataType::QASYMM8_SIGNED:
505 {
506 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
507
508 /* Down-conversion S16 -> QASYMM8_SIGNED */
509 if(ConvertPolicy::SATURATE == _policy)
510 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000511 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000512 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000513 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
514 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000515
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000516 int x = window_start_x;
517 for(; x <= (window_end_x - window_step_x); x += window_step_x)
518 {
519 const int16x8x2_t texels =
520 {
521 {
522 vqshlq_s16(vld1q_s16(input_ptr + x), b),
523 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
524 }
525 };
526
527 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
528 }
529
530 // Compute left-over elements
531 for(; x < window_end_x; ++x)
532 {
533 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
534 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000535 },
536 input, output);
537 }
538 else
539 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000540 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000541 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000542 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
543 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000544
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000545 int x = window_start_x;
546 for(; x <= (window_end_x - window_step_x); x += window_step_x)
547 {
548 const int16x8x2_t texels =
549 {
550 {
551 vshlq_s16(vld1q_s16(input_ptr + x), b),
552 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
553 }
554 };
555
556 vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
557 }
558
559 // Compute left-over elements
560 for(; x < window_end_x; ++x)
561 {
562 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
563 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000564 },
565 input, output);
566 }
567 break;
568 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100569 case DataType::U8:
570 {
571 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
572
573 /* Down-conversion S16 -> U8 */
574 if(ConvertPolicy::SATURATE == _policy)
575 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000576 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100577 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000578 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
579 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100580
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000581 int x = window_start_x;
582 for(; x <= (window_end_x - window_step_x); x += window_step_x)
583 {
584 const int16x8x2_t texels =
585 {
586 {
587 vqshlq_s16(vld1q_s16(input_ptr + x), b),
588 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
589 }
590 };
591
592 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
593 }
594
595 // Compute left-over elements
596 for(; x < window_end_x; ++x)
597 {
598 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
599 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100600 },
601 input, output);
602 }
603 else
604 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000605 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000607 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
608 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100609
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000610 int x = window_start_x;
611 for(; x <= (window_end_x - window_step_x); x += window_step_x)
612 {
613 const int16x8x2_t texels =
614 {
615 {
616 vshlq_s16(vld1q_s16(input_ptr + x), b),
617 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
618 }
619 };
620
621 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
622 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
623 }
624
625 // Compute left-over elements
626 for(; x < window_end_x; ++x)
627 {
628 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
629 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100630 },
631 input, output);
632 }
633 break;
634 }
635 case DataType::S32:
636 {
637 const int32x4_t b = vdupq_n_s32(_shift);
638
639 /* Up-conversion S16 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000640 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100641 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000642 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
643 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100644
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000645 int x = window_start_x;
646 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100647 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000648 const int16x8x2_t texels =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100649 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000650 {
651 vld1q_s16(input_ptr + x),
652 vld1q_s16(input_ptr + x + 8)
653 }
654 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100655
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000656 const int32x4x4_t texels_s32 =
657 {
658 {
659 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
660 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
661 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
662 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
663 }
664 };
665
666 vst1q_s32(output_ptr + x, texels_s32.val[0]);
667 vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
668 vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
669 vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
670 }
671
672 // Compute left-over elements
673 for(; x < window_end_x; ++x)
674 {
675 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
676 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100677 },
678 input, output);
679 break;
680 }
681 default:
682 ARM_COMPUTE_ERROR("Output data type not supported");
683 }
684 break;
685 }
686 case DataType::U16:
687 {
688 switch(_output->info()->data_type())
689 {
690 case DataType::U8:
691 {
692 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
693
694 /* Down-conversion U16 -> U8 */
695 if(ConvertPolicy::SATURATE == _policy)
696 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000697 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100698 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000699 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
700 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100701
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000702 int x = window_start_x;
703 for(; x <= (window_end_x - window_step_x); x += window_step_x)
704 {
705 const uint16x8x2_t texels =
706 {
707 {
708 vqshlq_u16(vld1q_u16(input_ptr + x), b),
709 vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
710 }
711 };
712
713 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
714 }
715
716 // Compute left-over elements
717 for(; x < window_end_x; ++x)
718 {
719 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
720 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100721 },
722 input, output);
723 }
724 else
725 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000726 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100727 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000728 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
729 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100730
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000731 int x = window_start_x;
732 for(; x <= (window_end_x - window_step_x); x += window_step_x)
733 {
734 const uint16x8x2_t texels =
735 {
736 {
737 vshlq_u16(vld1q_u16(input_ptr + x), b),
738 vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
739 }
740 };
741
742 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
743 }
744
745 // Compute left-over elements
746 for(; x < window_end_x; ++x)
747 {
748 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
749 }
750
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100751 },
752 input, output);
753 }
754 break;
755 }
756 case DataType::U32:
757 {
758 const int32x4_t b = vdupq_n_s32(_shift);
759
760 /* Up-conversion U16 -> U32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000761 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100762 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000763 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
764 const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100765
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000766 int x = window_start_x;
767 for(; x <= (window_end_x - window_step_x); x += window_step_x)
768 {
769 const uint16x8x2_t texels =
770 {
771 {
772 vld1q_u16(input_ptr + x),
773 vld1q_u16(input_ptr + x + 8)
774 }
775 };
776
777 vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
778 vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
779 vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
780 vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
781 }
782 // Compute left-over elements
783 for(; x < window_end_x; ++x)
784 {
785 *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
786 }
787
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100788 },
789 input, output);
790 break;
791 }
792 default:
793 ARM_COMPUTE_ERROR("Output data type not supported");
794 }
795 break;
796 }
Georgios Pinitase8291ac2020-02-26 09:58:13 +0000797#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
798 case DataType::BFLOAT16:
799 switch(_output->info()->data_type())
800 {
801 case DataType::F32:
802 {
803 /* Up-conversion BFLOAT16 -> F32 */
804 execute_window_loop(win, [&](const Coordinates &)
805 {
806 const auto input_ptr = reinterpret_cast<const bfloat16 *>(input.ptr());
807 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
808
809 int x = window_start_x;
810 for(; x <= (window_end_x - window_step_x); x += window_step_x)
811 {
812 const uint16x8x2_t texels =
813 {
814 {
815 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
816 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
817 }
818 };
819
820 vst1q_f32(reinterpret_cast<float *>(output.ptr()),
821 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
822 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4,
823 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
824 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8,
825 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
826 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12,
827 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
828 }
829
830 for(; x < window_end_x; ++x)
831 {
832 *(output_ptr + x) = float(*(input_ptr + x));
833 }
834 },
835 input, output);
836 break;
837 }
838 default:
839 ARM_COMPUTE_ERROR("Output data type unsupported");
840 }
841 break;
842#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100843#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
844 case DataType::F16:
845 switch(_output->info()->data_type())
846 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000847 case DataType::QASYMM8_SIGNED:
848 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000849 const float16_t scale_s = 1 << _shift;
850 const float16x8_t scale = vdupq_n_f16(scale_s);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000851
SiCong Lieb727f42020-06-09 18:37:19 +0100852 /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000853 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000854 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000855 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
856 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000857
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000858 int x = window_start_x;
859 for(; x <= (window_end_x - window_step_x); x += window_step_x)
860 {
861 const float16x8x2_t texels =
862 {
863 {
864 vmulq_f16(vld1q_f16(input_ptr + x), scale),
865 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
866 }
867 };
868
869 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
870 }
871
872 // Compute left-over elements
873 for(; x < window_end_x; ++x)
874 {
SiCong Lieb727f42020-06-09 18:37:19 +0100875 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000876 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000877 },
878 input, output);
879 break;
880 }
Michalis Spyroue2588182018-12-13 18:31:18 +0000881 case DataType::QASYMM8:
Usama Arif9e631c22019-05-14 17:10:40 +0100882 case DataType::U8:
Michalis Spyroue2588182018-12-13 18:31:18 +0000883 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000884 const float16_t scale_s = 1 << _shift;
885 const float16x8_t scale = vdupq_n_f16(scale_s);
Michalis Spyroue2588182018-12-13 18:31:18 +0000886
SiCong Lieb727f42020-06-09 18:37:19 +0100887 /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000888 execute_window_loop(win, [&](const Coordinates &)
Michalis Spyroue2588182018-12-13 18:31:18 +0000889 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000890 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
891 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Michalis Spyroue2588182018-12-13 18:31:18 +0000892
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000893 int x = window_start_x;
894 for(; x <= (window_end_x - window_step_x); x += window_step_x)
895 {
896 const float16x8x2_t texels =
897 {
898 {
899 vmulq_f16(vld1q_f16(input_ptr + x), scale),
900 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
901 }
902 };
903
904 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
905 }
906
907 // Compute left-over elements
908 for(; x < window_end_x; ++x)
909 {
SiCong Lieb727f42020-06-09 18:37:19 +0100910 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000911 }
912
Michalis Spyroue2588182018-12-13 18:31:18 +0000913 },
914 input, output);
915 break;
916 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100917 case DataType::F32:
918 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000919 const float scale_s = 1 << _shift;
920 const float32x4_t scale = vdupq_n_f32(scale_s);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100921
922 /* Up-conversion F16 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000923 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100924 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000925 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
926 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100927
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000928 int x = window_start_x;
929 for(; x <= (window_end_x - window_step_x); x += window_step_x)
930 {
931 const float16x8x2_t texels =
932 {
933 {
934 vld1q_f16(input_ptr + x),
935 vld1q_f16(input_ptr + x + 8)
936 }
937 };
938 vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
939 vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
940 vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
941 vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
942 }
943
944 // Compute left-over elements
945 for(; x < window_end_x; ++x)
946 {
947 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
948 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100949 },
950 input, output);
951 break;
952 }
Usama Arif9e631c22019-05-14 17:10:40 +0100953 case DataType::S32:
954 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000955 const float scale_s = 1 << _shift;
956 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +0100957
958 /* Up-conversion F16 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000959 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100960 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000961 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
962 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100963
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000964 int x = window_start_x;
965 for(; x <= (window_end_x - window_step_x); x += window_step_x)
966 {
967 const float16x8x2_t texels =
968 {
969 {
970 vld1q_f16(input_ptr + x),
971 vld1q_f16(input_ptr + x + 8)
972 }
973 };
974
975 vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
976 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
977 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
978 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
979 }
980
981 // Compute left-over elements
982 for(; x < window_end_x; ++x)
983 {
984 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
985 }
Usama Arif9e631c22019-05-14 17:10:40 +0100986 },
987 input, output);
988 break;
989 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100990 default:
991 ARM_COMPUTE_ERROR("Output data type not supported");
992 }
993 break;
Michalis Spyroue2588182018-12-13 18:31:18 +0000994#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100995 case DataType::F32:
996 switch(_output->info()->data_type())
997 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000998#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100999 case DataType::F16:
1000 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001001 const float scale_s = 1.f / (1 << _shift);
1002 const float32x4_t scale = vdupq_n_f32(scale_s);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001003
1004 /* Down-conversion F32 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001005 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001006 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001007 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1008 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001009
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001010 int x = window_start_x;
1011 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1012 {
1013 const float32x4x4_t texels =
1014 {
1015 {
1016 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1017 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1018 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1019 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
1020 }
1021 };
1022
1023 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1024 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1025 }
1026
1027 // Compute left-over elements
1028 for(; x < window_end_x; ++x)
1029 {
1030 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1031 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001032 },
1033 input, output);
1034 break;
1035 }
Michalis Spyroue2588182018-12-13 18:31:18 +00001036#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Georgios Pinitase8291ac2020-02-26 09:58:13 +00001037#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
1038 case DataType::BFLOAT16:
1039 {
1040 /* Down-conversion F32 -> BFLOAT16 */
1041 execute_window_loop(win, [&](const Coordinates &)
1042 {
1043 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1044 const auto output_ptr = reinterpret_cast<bfloat16 *>(output.ptr());
1045
1046 int x = window_start_x;
1047 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1048 {
1049 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(input.ptr()),
1050 reinterpret_cast<uint16_t *>(output.ptr()));
1051 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(input.ptr()) + 8,
1052 reinterpret_cast<uint16_t *>(output.ptr()) + 8);
1053 }
1054
1055 for(; x < window_end_x; ++x)
1056 {
1057 *(output_ptr + x) = *(input_ptr + x);
1058 }
1059 },
1060 input, output);
1061 break;
1062 }
1063#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
Usama Arif9e631c22019-05-14 17:10:40 +01001064 case DataType::S32:
1065 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001066 const float scale_s = 1.f / (1 << _shift);
1067 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001068
1069 /* Conversion F32 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001070 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001071 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001072 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1073 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001074
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001075 int x = window_start_x;
1076 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1077 {
1078 const float32x4x4_t texels =
1079 {
1080 {
1081 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1082 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1083 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1084 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1085 }
1086 };
1087
1088 vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
1089 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1090 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1091 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1092 }
1093
1094 // Compute left-over elements
1095 for(; x < window_end_x; ++x)
1096 {
1097 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
1098 }
Usama Arif9e631c22019-05-14 17:10:40 +01001099 },
1100 input, output);
1101 break;
1102 }
1103 case DataType::QASYMM8:
1104 case DataType::U8:
1105 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001106 const float scale_s = 1.f / (1 << _shift);
1107 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001108
1109 /* Down-conversion F32 -> U8 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001110 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001111 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001112 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1113 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001114
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001115 int x = window_start_x;
1116 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1117 {
1118 const float32x4x4_t texels =
1119 {
1120 {
1121 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1122 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1123 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1124 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1125 }
1126 };
1127
1128 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1129 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1130 }
1131
1132 // Compute left-over elements
1133 for(; x < window_end_x; ++x)
1134 {
1135 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
1136 }
Usama Arif9e631c22019-05-14 17:10:40 +01001137 },
1138 input, output);
1139 break;
1140 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001141 case DataType::QASYMM8_SIGNED:
1142 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001143 const float scale_s = 1.f / (1 << _shift);
1144 const float32x4_t scale = vdupq_n_f32(scale_s);
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001145
1146 /* Down-conversion F32 -> QASYMM8_SIGNED */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001147 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001148 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001149 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1150 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001151
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001152 int x = window_start_x;
1153 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1154 {
1155 const float32x4x4_t texels =
1156 {
1157 {
1158 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1159 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1160 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1161 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1162 }
1163 };
1164
1165 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1166 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1167 }
1168 // Compute left-over elements
1169 for(; x < window_end_x; ++x)
1170 {
1171 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
1172 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001173 },
1174 input, output);
1175 break;
1176 }
Usama Arif9e631c22019-05-14 17:10:40 +01001177
Usama Arif9e631c22019-05-14 17:10:40 +01001178 default:
1179 ARM_COMPUTE_ERROR("Output data type not supported");
1180 }
1181 break;
1182
1183 case DataType::S32:
1184 switch(_output->info()->data_type())
1185 {
1186#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1187 case DataType::F16:
1188 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001189 const float scale_s = 1.f / (1 << _shift);
1190 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001191
1192 /* Down-conversion S32 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001193 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001194 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001195 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1196 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001197
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001198 int x = window_start_x;
1199 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1200 {
1201 const float32x4x4_t texels =
1202 {
1203 {
1204 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
1205 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
1206 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
1207 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
1208 }
1209 };
1210
1211 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1212 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1213 }
1214
1215 // Compute left-over elements
1216 for(; x < window_end_x; ++x)
1217 {
SiCong Lieb727f42020-06-09 18:37:19 +01001218 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001219 }
Usama Arif9e631c22019-05-14 17:10:40 +01001220 },
1221 input, output);
1222 break;
1223 }
1224#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1225 case DataType::F32:
1226 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001227 const int scale_s = 1.f / (1 << _shift);
1228 const int32x4_t scale = vdupq_n_s32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001229
1230 /* Conversion S32 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001231 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001232 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001233 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1234 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001235
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001236 int x = window_start_x;
1237 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1238 {
1239 const int32x4x4_t texels =
1240 {
1241 {
1242 vmulq_s32(vld1q_s32(input_ptr + x), scale),
1243 vmulq_s32(vld1q_s32(input_ptr + x + 4), scale),
1244 vmulq_s32(vld1q_s32(input_ptr + x + 8), scale),
1245 vmulq_s32(vld1q_s32(input_ptr + x + 12), scale),
1246 }
1247 };
1248
1249 vst1q_f32(output_ptr + x, vcvtq_f32_s32(texels.val[0]));
1250 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1251 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1252 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1253 }
1254
1255 // Compute left-over elements
1256 for(; x < window_end_x; ++x)
1257 {
1258 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
1259 }
Usama Arif9e631c22019-05-14 17:10:40 +01001260 },
1261 input, output);
1262 break;
1263 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001264 case DataType::QASYMM8_SIGNED:
1265 {
1266 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1267
1268 /* Down-conversion S32 -> QASYMM8_SIGNED */
1269 if(ConvertPolicy::SATURATE == _policy)
1270 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001271 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001272 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001273 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1274 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1275
1276 int x = window_start_x;
1277 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001278 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001279 const int32x4x4_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001280 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001281 {
1282 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1283 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1284 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1285 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1286 }
1287 };
1288 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1289 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1290 }
1291
1292 // Compute left-over elements
1293 for(; x < window_end_x; ++x)
1294 {
1295 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
1296 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001297 },
1298 input, output);
1299 }
1300 else
1301 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001302 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001303 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001304 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1305 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001306
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001307 int x = window_start_x;
1308 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1309 {
1310 const int32x4x4_t texels =
1311 {
1312 {
1313 vshlq_s32(vld1q_s32(input_ptr + x), b),
1314 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1315 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1316 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1317 }
1318 };
1319
1320 vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1321 vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1322 }
1323
1324 // Compute left-over elements
1325 for(; x < window_end_x; ++x)
1326 {
1327 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
1328 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001329 },
1330 input, output);
1331 }
1332 break;
1333 }
Usama Arif9e631c22019-05-14 17:10:40 +01001334 case DataType::QASYMM8:
1335 case DataType::U8:
1336 {
1337 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1338
1339 /* Down-conversion S32 -> U8 */
1340 if(ConvertPolicy::SATURATE == _policy)
1341 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001342 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001343 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001344 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1345 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1346
1347 int x = window_start_x;
1348 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +01001349 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001350 const int32x4x4_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +01001351 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001352 {
1353 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1354 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1355 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1356 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1357 }
1358 };
1359 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1360 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1361 }
1362
1363 // Compute left-over elements
1364 for(; x < window_end_x; ++x)
1365 {
1366 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
1367 }
Usama Arif9e631c22019-05-14 17:10:40 +01001368 },
1369 input, output);
1370 }
1371 else
1372 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001373 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001374 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001375 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1376 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001377
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001378 int x = window_start_x;
1379 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1380 {
1381 const int32x4x4_t texels =
1382 {
1383 {
1384 vshlq_s32(vld1q_s32(input_ptr + x), b),
1385 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1386 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1387 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1388 }
1389 };
1390
1391 vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1392 vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1393 }
1394
1395 // Compute left-over elements
1396 for(; x < window_end_x; ++x)
1397 {
1398 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
1399 }
Usama Arif9e631c22019-05-14 17:10:40 +01001400 },
1401 input, output);
1402 }
1403 break;
1404 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001405 default:
1406 ARM_COMPUTE_ERROR("Output data type not supported");
1407 }
1408 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001409 default:
1410 ARM_COMPUTE_ERROR("Not supported");
1411 }
1412}