blob: f824f7ac5828ef378693d77aba52420731e18274 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Luca Foschianidaa3aba2020-01-08 15:55:08 +00002 * Copyright (c) 2016-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000024#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEFixedPoint.h"
Michalis Spyroue2588182018-12-13 18:31:18 +000031#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
Michalis Spyrou4de2d592020-02-21 18:58:38 +000034#include "arm_compute/core/utils/misc/SaturateCast.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035
36#include <arm_neon.h>
37
38using namespace arm_compute;
39
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010040namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010041{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010042Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
43{
44 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
45 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
46 ARM_COMPUTE_UNUSED(policy);
47 ARM_COMPUTE_RETURN_ERROR_ON(input == output);
Luca Foschianidaa3aba2020-01-08 15:55:08 +000048 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32);
49 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16,
50 DataType::F32);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010051 ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
52
Luca Foschianidaa3aba2020-01-08 15:55:08 +000053 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8_SIGNED && (output->data_type() != DataType::S16 && output->data_type() != DataType::S32
54 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
55 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
56
Usama Arif9e631c22019-05-14 17:10:40 +010057 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Michalis Spyrou6bff1952019-10-02 17:22:11 +010058 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
Usama Arif9e631c22019-05-14 17:10:40 +010059 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
Michalis Spyroue2588182018-12-13 18:31:18 +000060
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010061 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Usama Arif9e631c22019-05-14 17:10:40 +010062 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
63 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010064
65 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
66 "Only data_types supported [in] U16 -> [out] U8, U32");
67
Luca Foschianidaa3aba2020-01-08 15:55:08 +000068 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010069 "Only data_types supported [in] S16 -> [out] U8, S32");
70
Luca Foschianidaa3aba2020-01-08 15:55:08 +000071 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
72 && output->data_type() != DataType::U8
73 && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
Usama Arif9e631c22019-05-14 17:10:40 +010074 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010075
Luca Foschianidaa3aba2020-01-08 15:55:08 +000076 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
77 && output->data_type() != DataType::F16
78 && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
Usama Arif9e631c22019-05-14 17:10:40 +010079 "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
80
Luca Foschianidaa3aba2020-01-08 15:55:08 +000081 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
82 && output->data_type() != DataType::F16
83 && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
Usama Arif9e631c22019-05-14 17:10:40 +010084 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010085
86 // Validate in case of configured output
87 if(output->total_size() > 0)
88 {
89 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
90 }
91
92 return Status{};
93}
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010094} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010095
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000096NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010097 : _input(nullptr), _output(nullptr), _policy(), _shift(0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010098{
99}
100
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100101void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100102{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100103 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
104
105 // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
106 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
Georgios Pinitase2229412017-07-12 12:30:40 +0100107
108 _input = input;
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100109 _output = output;
Georgios Pinitase2229412017-07-12 12:30:40 +0100110 _policy = policy;
111 _shift = shift;
112
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100113 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
Georgios Pinitase2229412017-07-12 12:30:40 +0100114
115 // Configure kernel window
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000116 Window win = calculate_max_window(*input->info(), Steps());
117 Coordinates coord;
118 coord.set_num_dimensions(output->info()->num_dimensions());
119 output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
120
121 ICPPKernel::configure(win);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100122}
Georgios Pinitase2229412017-07-12 12:30:40 +0100123
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100124Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
125{
126 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100127 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100128}
129
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000130void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100131{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100132 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100133 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Georgios Pinitase2229412017-07-12 12:30:40 +0100134 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100135 ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136 ARM_COMPUTE_ERROR_ON(_input == _output);
137
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000138 const auto window_start_x = static_cast<int>(window.x().start());
139 const auto window_end_x = static_cast<int>(window.x().end());
140 const int window_step_x = 16;
141
142 Window win{ window };
143 win.set(Window::DimX, Window::Dimension(0, 1, 1));
144
145 Iterator input(_input, win);
146 Iterator output(_output, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100147
148 switch(_input->info()->data_type())
149 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000150 case DataType::QASYMM8_SIGNED:
151 {
152 const int16x8_t b = vdupq_n_s16(_shift);
153
154 switch(_output->info()->data_type())
155 {
156 case DataType::S16:
157 {
158 /* Up-conversion QASYMM8_SIGNED -> S16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000159 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000160 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000161 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
162 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
163 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000164
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000165 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000166 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000167 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000168
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000169 const int16x8x2_t texels =
170 {
171 {
172 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
173 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
174 }
175 };
176
177 vst1q_s16(output_ptr + x, texels.val[0]);
178 vst1q_s16(output_ptr + x + 8, texels.val[1]);
179 }
180
181 // Compute left-over elements
182 for(; x < window_end_x; ++x)
183 {
184 *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
185 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000186 },
187 input, output);
188 break;
189 }
190 case DataType::S32:
191 {
192 /* Up-conversion QASYMM8_SIGNED -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000193 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000194 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000195 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
196 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
197 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000198
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000199 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000200 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000201 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000202
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000203 const int16x8x2_t texels =
204 {
205 {
206 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
207 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
208 }
209 };
210
211 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
212 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
213 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
214 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
215 }
216
217 // Compute left-over elements
218 for(; x < window_end_x; ++x)
219 {
220 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
221 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000222 },
223 input, output);
224 break;
225 }
226 case DataType::F32:
227 {
228 /* Up-conversion QASYMM8_SIGNED -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000229 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000230 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000231 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
232 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000233
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000234 int x = window_start_x;
235 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000236 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000237 const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(input.ptr()));
238
239 const int16x8x2_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000240 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000241 {
242 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
243 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
244 }
245 };
246 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
247 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
248 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
249 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
250 }
251
252 // Compute left-over elements
253 for(; x < window_end_x; ++x)
254 {
255 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
256 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000257 },
258 input, output);
259 break;
260 }
261#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
262 case DataType::F16:
263 {
264 /* Up-conversion QASYMM8_SIGNED -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000265 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000266 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000267 const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
268 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
269 int x = window_start_x;
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000270
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000271 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000272 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000273 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
274
275 const int16x8x2_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000276 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000277 {
278 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
279 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
280 }
281 };
282 vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
283 vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
284 }
285
286 // Compute left-over elements
287 for(; x < window_end_x; ++x)
288 {
289 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
290 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000291 },
292 input, output);
293 break;
294 }
295#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
296
297 default:
298 ARM_COMPUTE_ERROR("Output data type not supported");
299 }
300 break;
301 }
302
Michalis Spyroue2588182018-12-13 18:31:18 +0000303 case DataType::QASYMM8:
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100304 case DataType::U8:
305 {
306 const int16x8_t b = vdupq_n_s16(_shift);
307
308 switch(_output->info()->data_type())
309 {
310 case DataType::S16:
311 {
312 /* Up-conversion U8 -> S16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000313 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100314 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000315 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
316 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100317
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000318 int x = window_start_x;
319 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100320 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000321 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100322
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000323 const int16x8x2_t texels =
324 {
325 {
326 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
327 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
328 }
329 };
330
331 vst1q_s16(output_ptr + x, texels.val[0]);
332 vst1q_s16(output_ptr + x + 8, texels.val[1]);
333 }
334
335 // Compute left-over elements
336 for(; x < window_end_x; ++x)
337 {
338 auto in = static_cast<int32_t>(*(input_ptr + x));
339 *(output_ptr + x) = in << _shift;
340 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100341 },
342 input, output);
343 break;
344 }
345 case DataType::S32:
346 {
347 /* Up-conversion U8 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000348 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100349 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000350 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
351 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100352
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000353 int x = window_start_x;
354 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100355 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000356 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100357
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000358 const int16x8x2_t texels =
359 {
360 {
361 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
362 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
363 }
364 };
365
366 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
367 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
368 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
369 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
370 }
371
372 // Compute left-over elements
373 for(; x < window_end_x; ++x)
374 {
375 auto in = static_cast<uint32_t>(*(input_ptr + x));
376 *(output_ptr + x) = in << _shift;
377 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100378 },
379 input, output);
380 break;
381 }
Usama Arif9e631c22019-05-14 17:10:40 +0100382 case DataType::F32:
383 {
384 /* Up-conversion U8 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000385 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100386 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000387 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
388 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100389
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000390 int x = window_start_x;
391 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +0100392 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000393 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
394
395 const int16x8x2_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +0100396 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000397 {
398 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
399 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
400 }
401 };
402 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
403 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
404 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
405 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
406 }
407
408 // Compute left-over elements
409 for(; x < window_end_x; ++x)
410 {
411 auto in = static_cast<uint32_t>(*(input_ptr + x));
412 *(output_ptr + x) = static_cast<float>(in << _shift);
413 }
Usama Arif9e631c22019-05-14 17:10:40 +0100414 },
415 input, output);
416 break;
417 }
418#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
419 case DataType::F16:
420 {
421 /* Up-conversion U8 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000422 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100423 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000424 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
425 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100426
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000427 int x = window_start_x;
428 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +0100429 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000430 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
431
432 const int16x8x2_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +0100433 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000434 {
435 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
436 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
437 }
438 };
439 vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
440 vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
441 }
442
443 // Compute left-over elements
444 for(; x < window_end_x; ++x)
445 {
446 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
447 }
Usama Arif9e631c22019-05-14 17:10:40 +0100448 },
449 input, output);
450 break;
451 }
452#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100453 case DataType::U16:
454 {
455 /* Up-conversion U8 -> U16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000456 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100457 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000458 const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
459 const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000461 int x = window_start_x;
462 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100463 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000464 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100465
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000466 const uint16x8x2_t texels =
467 {
468 {
469 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
470 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
471 }
472 };
473
474 vst1q_u16(output_ptr + x, texels.val[0]);
475 vst1q_u16(output_ptr + x + 8, texels.val[1]);
476 }
477
478 // Compute left-over elements
479 for(; x < window_end_x; ++x)
480 {
481 *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
482 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100483 },
484 input, output);
485 break;
486 }
487 default:
488 ARM_COMPUTE_ERROR("Output data type not supported");
489 }
490 break;
491 }
492 case DataType::S16:
493 {
494 switch(_output->info()->data_type())
495 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000496 case DataType::QASYMM8_SIGNED:
497 {
498 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
499
500 /* Down-conversion S16 -> QASYMM8_SIGNED */
501 if(ConvertPolicy::SATURATE == _policy)
502 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000503 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000504 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000505 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
506 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000507
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000508 int x = window_start_x;
509 for(; x <= (window_end_x - window_step_x); x += window_step_x)
510 {
511 const int16x8x2_t texels =
512 {
513 {
514 vqshlq_s16(vld1q_s16(input_ptr + x), b),
515 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
516 }
517 };
518
519 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
520 }
521
522 // Compute left-over elements
523 for(; x < window_end_x; ++x)
524 {
525 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
526 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000527 },
528 input, output);
529 }
530 else
531 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000532 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000533 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000534 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
535 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000536
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000537 int x = window_start_x;
538 for(; x <= (window_end_x - window_step_x); x += window_step_x)
539 {
540 const int16x8x2_t texels =
541 {
542 {
543 vshlq_s16(vld1q_s16(input_ptr + x), b),
544 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
545 }
546 };
547
548 vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
549 }
550
551 // Compute left-over elements
552 for(; x < window_end_x; ++x)
553 {
554 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
555 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000556 },
557 input, output);
558 }
559 break;
560 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100561 case DataType::U8:
562 {
563 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
564
565 /* Down-conversion S16 -> U8 */
566 if(ConvertPolicy::SATURATE == _policy)
567 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000568 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100569 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000570 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
571 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100572
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000573 int x = window_start_x;
574 for(; x <= (window_end_x - window_step_x); x += window_step_x)
575 {
576 const int16x8x2_t texels =
577 {
578 {
579 vqshlq_s16(vld1q_s16(input_ptr + x), b),
580 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
581 }
582 };
583
584 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
585 }
586
587 // Compute left-over elements
588 for(; x < window_end_x; ++x)
589 {
590 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
591 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100592 },
593 input, output);
594 }
595 else
596 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000597 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100598 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000599 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
600 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100601
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000602 int x = window_start_x;
603 for(; x <= (window_end_x - window_step_x); x += window_step_x)
604 {
605 const int16x8x2_t texels =
606 {
607 {
608 vshlq_s16(vld1q_s16(input_ptr + x), b),
609 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
610 }
611 };
612
613 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
614 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
615 }
616
617 // Compute left-over elements
618 for(; x < window_end_x; ++x)
619 {
620 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
621 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100622 },
623 input, output);
624 }
625 break;
626 }
627 case DataType::S32:
628 {
629 const int32x4_t b = vdupq_n_s32(_shift);
630
631 /* Up-conversion S16 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000632 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100633 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000634 const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
635 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100636
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000637 int x = window_start_x;
638 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100639 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000640 const int16x8x2_t texels =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100641 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000642 {
643 vld1q_s16(input_ptr + x),
644 vld1q_s16(input_ptr + x + 8)
645 }
646 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100647
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000648 const int32x4x4_t texels_s32 =
649 {
650 {
651 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
652 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
653 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
654 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
655 }
656 };
657
658 vst1q_s32(output_ptr + x, texels_s32.val[0]);
659 vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
660 vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
661 vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
662 }
663
664 // Compute left-over elements
665 for(; x < window_end_x; ++x)
666 {
667 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
668 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100669 },
670 input, output);
671 break;
672 }
673 default:
674 ARM_COMPUTE_ERROR("Output data type not supported");
675 }
676 break;
677 }
678 case DataType::U16:
679 {
680 switch(_output->info()->data_type())
681 {
682 case DataType::U8:
683 {
684 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
685
686 /* Down-conversion U16 -> U8 */
687 if(ConvertPolicy::SATURATE == _policy)
688 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000689 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100690 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000691 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
692 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100693
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000694 int x = window_start_x;
695 for(; x <= (window_end_x - window_step_x); x += window_step_x)
696 {
697 const uint16x8x2_t texels =
698 {
699 {
700 vqshlq_u16(vld1q_u16(input_ptr + x), b),
701 vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
702 }
703 };
704
705 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
706 }
707
708 // Compute left-over elements
709 for(; x < window_end_x; ++x)
710 {
711 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
712 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100713 },
714 input, output);
715 }
716 else
717 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000718 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100719 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000720 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
721 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100722
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000723 int x = window_start_x;
724 for(; x <= (window_end_x - window_step_x); x += window_step_x)
725 {
726 const uint16x8x2_t texels =
727 {
728 {
729 vshlq_u16(vld1q_u16(input_ptr + x), b),
730 vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
731 }
732 };
733
734 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
735 }
736
737 // Compute left-over elements
738 for(; x < window_end_x; ++x)
739 {
740 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
741 }
742
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100743 },
744 input, output);
745 }
746 break;
747 }
748 case DataType::U32:
749 {
750 const int32x4_t b = vdupq_n_s32(_shift);
751
752 /* Up-conversion U16 -> U32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000753 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100754 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000755 const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr());
756 const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100757
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000758 int x = window_start_x;
759 for(; x <= (window_end_x - window_step_x); x += window_step_x)
760 {
761 const uint16x8x2_t texels =
762 {
763 {
764 vld1q_u16(input_ptr + x),
765 vld1q_u16(input_ptr + x + 8)
766 }
767 };
768
769 vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
770 vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
771 vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
772 vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
773 }
774 // Compute left-over elements
775 for(; x < window_end_x; ++x)
776 {
777 *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
778 }
779
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100780 },
781 input, output);
782 break;
783 }
784 default:
785 ARM_COMPUTE_ERROR("Output data type not supported");
786 }
787 break;
788 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100789#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
790 case DataType::F16:
791 switch(_output->info()->data_type())
792 {
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000793 case DataType::QASYMM8_SIGNED:
794 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000795 const float16_t scale_s = 1 << _shift;
796 const float16x8_t scale = vdupq_n_f16(scale_s);
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000797
798 /* Up-conversion F16 -> QASYMM8_SIGNED */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000799 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000800 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000801 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
802 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000803
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000804 int x = window_start_x;
805 for(; x <= (window_end_x - window_step_x); x += window_step_x)
806 {
807 const float16x8x2_t texels =
808 {
809 {
810 vmulq_f16(vld1q_f16(input_ptr + x), scale),
811 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
812 }
813 };
814
815 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
816 }
817
818 // Compute left-over elements
819 for(; x < window_end_x; ++x)
820 {
821 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
822 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +0000823 },
824 input, output);
825 break;
826 }
Michalis Spyroue2588182018-12-13 18:31:18 +0000827 case DataType::QASYMM8:
Usama Arif9e631c22019-05-14 17:10:40 +0100828 case DataType::U8:
Michalis Spyroue2588182018-12-13 18:31:18 +0000829 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000830 const float16_t scale_s = 1 << _shift;
831 const float16x8_t scale = vdupq_n_f16(scale_s);
Michalis Spyroue2588182018-12-13 18:31:18 +0000832
Usama Arif9e631c22019-05-14 17:10:40 +0100833 /* Up-conversion F16 -> U8 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000834 execute_window_loop(win, [&](const Coordinates &)
Michalis Spyroue2588182018-12-13 18:31:18 +0000835 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000836 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
837 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Michalis Spyroue2588182018-12-13 18:31:18 +0000838
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000839 int x = window_start_x;
840 for(; x <= (window_end_x - window_step_x); x += window_step_x)
841 {
842 const float16x8x2_t texels =
843 {
844 {
845 vmulq_f16(vld1q_f16(input_ptr + x), scale),
846 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
847 }
848 };
849
850 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
851 }
852
853 // Compute left-over elements
854 for(; x < window_end_x; ++x)
855 {
856 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) * scale_s);
857 }
858
Michalis Spyroue2588182018-12-13 18:31:18 +0000859 },
860 input, output);
861 break;
862 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100863 case DataType::F32:
864 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000865 const float scale_s = 1 << _shift;
866 const float32x4_t scale = vdupq_n_f32(scale_s);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100867
868 /* Up-conversion F16 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000869 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100870 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000871 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
872 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100873
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000874 int x = window_start_x;
875 for(; x <= (window_end_x - window_step_x); x += window_step_x)
876 {
877 const float16x8x2_t texels =
878 {
879 {
880 vld1q_f16(input_ptr + x),
881 vld1q_f16(input_ptr + x + 8)
882 }
883 };
884 vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
885 vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
886 vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
887 vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
888 }
889
890 // Compute left-over elements
891 for(; x < window_end_x; ++x)
892 {
893 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
894 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100895 },
896 input, output);
897 break;
898 }
Usama Arif9e631c22019-05-14 17:10:40 +0100899 case DataType::S32:
900 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000901 const float scale_s = 1 << _shift;
902 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +0100903
904 /* Up-conversion F16 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000905 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100906 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000907 const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
908 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100909
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000910 int x = window_start_x;
911 for(; x <= (window_end_x - window_step_x); x += window_step_x)
912 {
913 const float16x8x2_t texels =
914 {
915 {
916 vld1q_f16(input_ptr + x),
917 vld1q_f16(input_ptr + x + 8)
918 }
919 };
920
921 vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
922 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
923 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
924 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
925 }
926
927 // Compute left-over elements
928 for(; x < window_end_x; ++x)
929 {
930 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
931 }
Usama Arif9e631c22019-05-14 17:10:40 +0100932 },
933 input, output);
934 break;
935 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100936 default:
937 ARM_COMPUTE_ERROR("Output data type not supported");
938 }
939 break;
Michalis Spyroue2588182018-12-13 18:31:18 +0000940#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100941 case DataType::F32:
942 switch(_output->info()->data_type())
943 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000944#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100945 case DataType::F16:
946 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000947 const float scale_s = 1.f / (1 << _shift);
948 const float32x4_t scale = vdupq_n_f32(scale_s);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100949
950 /* Down-conversion F32 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000951 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100952 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000953 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
954 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100955
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000956 int x = window_start_x;
957 for(; x <= (window_end_x - window_step_x); x += window_step_x)
958 {
959 const float32x4x4_t texels =
960 {
961 {
962 vmulq_f32(vld1q_f32(input_ptr + x), scale),
963 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
964 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
965 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
966 }
967 };
968
969 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
970 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
971 }
972
973 // Compute left-over elements
974 for(; x < window_end_x; ++x)
975 {
976 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
977 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100978 },
979 input, output);
980 break;
981 }
Michalis Spyroue2588182018-12-13 18:31:18 +0000982#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Usama Arif9e631c22019-05-14 17:10:40 +0100983 case DataType::S32:
984 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000985 const float scale_s = 1.f / (1 << _shift);
986 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +0100987
988 /* Conversion F32 -> S32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000989 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100990 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000991 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
992 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +0100993
Michalis Spyrou4de2d592020-02-21 18:58:38 +0000994 int x = window_start_x;
995 for(; x <= (window_end_x - window_step_x); x += window_step_x)
996 {
997 const float32x4x4_t texels =
998 {
999 {
1000 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1001 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1002 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1003 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1004 }
1005 };
1006
1007 vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
1008 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1009 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1010 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1011 }
1012
1013 // Compute left-over elements
1014 for(; x < window_end_x; ++x)
1015 {
1016 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
1017 }
Usama Arif9e631c22019-05-14 17:10:40 +01001018 },
1019 input, output);
1020 break;
1021 }
1022 case DataType::QASYMM8:
1023 case DataType::U8:
1024 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001025 const float scale_s = 1.f / (1 << _shift);
1026 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001027
1028 /* Down-conversion F32 -> U8 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001029 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001030 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001031 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1032 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001033
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001034 int x = window_start_x;
1035 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1036 {
1037 const float32x4x4_t texels =
1038 {
1039 {
1040 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1041 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1042 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1043 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1044 }
1045 };
1046
1047 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1048 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1049 }
1050
1051 // Compute left-over elements
1052 for(; x < window_end_x; ++x)
1053 {
1054 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
1055 }
Usama Arif9e631c22019-05-14 17:10:40 +01001056 },
1057 input, output);
1058 break;
1059 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001060 case DataType::QASYMM8_SIGNED:
1061 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001062 const float scale_s = 1.f / (1 << _shift);
1063 const float32x4_t scale = vdupq_n_f32(scale_s);
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001064
1065 /* Down-conversion F32 -> QASYMM8_SIGNED */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001066 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001067 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001068 const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
1069 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001070
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001071 int x = window_start_x;
1072 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1073 {
1074 const float32x4x4_t texels =
1075 {
1076 {
1077 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1078 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1079 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1080 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1081 }
1082 };
1083
1084 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1085 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1086 }
1087 // Compute left-over elements
1088 for(; x < window_end_x; ++x)
1089 {
1090 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
1091 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001092 },
1093 input, output);
1094 break;
1095 }
Usama Arif9e631c22019-05-14 17:10:40 +01001096
Usama Arif9e631c22019-05-14 17:10:40 +01001097 default:
1098 ARM_COMPUTE_ERROR("Output data type not supported");
1099 }
1100 break;
1101
1102 case DataType::S32:
1103 switch(_output->info()->data_type())
1104 {
1105#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1106 case DataType::F16:
1107 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001108 const float scale_s = 1.f / (1 << _shift);
1109 const float32x4_t scale = vdupq_n_f32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001110
1111 /* Down-conversion S32 -> F16 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001112 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001113 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001114 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1115 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001116
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001117 int x = window_start_x;
1118 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1119 {
1120 const float32x4x4_t texels =
1121 {
1122 {
1123 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
1124 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
1125 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
1126 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
1127 }
1128 };
1129
1130 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1131 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1132 }
1133
1134 // Compute left-over elements
1135 for(; x < window_end_x; ++x)
1136 {
1137 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
1138 }
Usama Arif9e631c22019-05-14 17:10:40 +01001139 },
1140 input, output);
1141 break;
1142 }
1143#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1144 case DataType::F32:
1145 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001146 const int scale_s = 1.f / (1 << _shift);
1147 const int32x4_t scale = vdupq_n_s32(scale_s);
Usama Arif9e631c22019-05-14 17:10:40 +01001148
1149 /* Conversion S32 -> F32 */
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001150 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001151 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001152 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1153 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001154
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001155 int x = window_start_x;
1156 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1157 {
1158 const int32x4x4_t texels =
1159 {
1160 {
1161 vmulq_s32(vld1q_s32(input_ptr + x), scale),
1162 vmulq_s32(vld1q_s32(input_ptr + x + 4), scale),
1163 vmulq_s32(vld1q_s32(input_ptr + x + 8), scale),
1164 vmulq_s32(vld1q_s32(input_ptr + x + 12), scale),
1165 }
1166 };
1167
1168 vst1q_f32(output_ptr + x, vcvtq_f32_s32(texels.val[0]));
1169 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1170 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1171 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1172 }
1173
1174 // Compute left-over elements
1175 for(; x < window_end_x; ++x)
1176 {
1177 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
1178 }
Usama Arif9e631c22019-05-14 17:10:40 +01001179 },
1180 input, output);
1181 break;
1182 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001183 case DataType::QASYMM8_SIGNED:
1184 {
1185 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1186
1187 /* Down-conversion S32 -> QASYMM8_SIGNED */
1188 if(ConvertPolicy::SATURATE == _policy)
1189 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001190 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001191 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001192 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1193 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1194
1195 int x = window_start_x;
1196 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001197 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001198 const int32x4x4_t texels =
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001199 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001200 {
1201 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1202 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1203 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1204 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1205 }
1206 };
1207 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1208 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1209 }
1210
1211 // Compute left-over elements
1212 for(; x < window_end_x; ++x)
1213 {
1214 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
1215 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001216 },
1217 input, output);
1218 }
1219 else
1220 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001221 execute_window_loop(win, [&](const Coordinates &)
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001222 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001223 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1224 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001225
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001226 int x = window_start_x;
1227 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1228 {
1229 const int32x4x4_t texels =
1230 {
1231 {
1232 vshlq_s32(vld1q_s32(input_ptr + x), b),
1233 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1234 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1235 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1236 }
1237 };
1238
1239 vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1240 vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1241 }
1242
1243 // Compute left-over elements
1244 for(; x < window_end_x; ++x)
1245 {
1246 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
1247 }
Luca Foschianidaa3aba2020-01-08 15:55:08 +00001248 },
1249 input, output);
1250 }
1251 break;
1252 }
Usama Arif9e631c22019-05-14 17:10:40 +01001253 case DataType::QASYMM8:
1254 case DataType::U8:
1255 {
1256 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1257
1258 /* Down-conversion S32 -> U8 */
1259 if(ConvertPolicy::SATURATE == _policy)
1260 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001261 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001262 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001263 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1264 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1265
1266 int x = window_start_x;
1267 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Usama Arif9e631c22019-05-14 17:10:40 +01001268 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001269 const int32x4x4_t texels =
Usama Arif9e631c22019-05-14 17:10:40 +01001270 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001271 {
1272 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1273 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1274 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1275 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1276 }
1277 };
1278 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1279 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1280 }
1281
1282 // Compute left-over elements
1283 for(; x < window_end_x; ++x)
1284 {
1285 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
1286 }
Usama Arif9e631c22019-05-14 17:10:40 +01001287 },
1288 input, output);
1289 }
1290 else
1291 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001292 execute_window_loop(win, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +01001293 {
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001294 const auto input_ptr = reinterpret_cast<const int32_t *>(input.ptr());
1295 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Usama Arif9e631c22019-05-14 17:10:40 +01001296
Michalis Spyrou4de2d592020-02-21 18:58:38 +00001297 int x = window_start_x;
1298 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1299 {
1300 const int32x4x4_t texels =
1301 {
1302 {
1303 vshlq_s32(vld1q_s32(input_ptr + x), b),
1304 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1305 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1306 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1307 }
1308 };
1309
1310 vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1311 vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1312 }
1313
1314 // Compute left-over elements
1315 for(; x < window_end_x; ++x)
1316 {
1317 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
1318 }
Usama Arif9e631c22019-05-14 17:10:40 +01001319 },
1320 input, output);
1321 }
1322 break;
1323 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +01001324 default:
1325 ARM_COMPUTE_ERROR("Output data type not supported");
1326 }
1327 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001328 default:
1329 ARM_COMPUTE_ERROR("Not supported");
1330 }
1331}