blob: d00c5009d2fd4f7015cdff11802d6337011074da [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michalis Spyroua4f378d2019-04-26 14:54:54 +01002 * Copyright (c) 2016-2019 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000024#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEFixedPoint.h"
Michalis Spyroue2588182018-12-13 18:31:18 +000031#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
34
35#include <arm_neon.h>
36
37using namespace arm_compute;
38
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010039namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010040{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010041Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
42{
43 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
44 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
45 ARM_COMPUTE_UNUSED(policy);
46 ARM_COMPUTE_RETURN_ERROR_ON(input == output);
Usama Arif9e631c22019-05-14 17:10:40 +010047 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32);
Michalis Spyroue2588182018-12-13 18:31:18 +000048 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010049 ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
50
Usama Arif9e631c22019-05-14 17:10:40 +010051 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Michalis Spyrou6bff1952019-10-02 17:22:11 +010052 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
Usama Arif9e631c22019-05-14 17:10:40 +010053 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
Michalis Spyroue2588182018-12-13 18:31:18 +000054
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010055 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Usama Arif9e631c22019-05-14 17:10:40 +010056 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
57 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010058
59 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
60 "Only data_types supported [in] U16 -> [out] U8, U32");
61
62 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
63 "Only data_types supported [in] S16 -> [out] U8, S32");
64
Michalis Spyrou6bff1952019-10-02 17:22:11 +010065 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::U8 && output->data_type() != DataType::F32
66 && output->data_type() != DataType::S32),
Usama Arif9e631c22019-05-14 17:10:40 +010067 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010068
Michalis Spyrou6bff1952019-10-02 17:22:11 +010069 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::S32
70 && output->data_type() != DataType::U8),
Usama Arif9e631c22019-05-14 17:10:40 +010071 "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
72
Michalis Spyrou6bff1952019-10-02 17:22:11 +010073 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32
74 && output->data_type() != DataType::U8),
Usama Arif9e631c22019-05-14 17:10:40 +010075 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010076
77 // Validate in case of configured output
78 if(output->total_size() > 0)
79 {
80 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
81 }
82
83 return Status{};
84}
85
86std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
87{
88 constexpr unsigned int num_elems_processed_per_iteration = 16;
89
90 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
91
92 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
93 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
94 bool window_changed = update_window_and_padding(win, input_access, output_access);
95 output_access.set_valid_region(win, output->valid_region());
96
97 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
98 return std::make_pair(err, win);
99}
100} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100101
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000102NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100103 : _input(nullptr), _output(nullptr), _policy(), _shift(0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100104{
105}
106
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100107void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100108{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100109 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
110
111 // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
112 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
Georgios Pinitase2229412017-07-12 12:30:40 +0100113
114 _input = input;
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100115 _output = output;
Georgios Pinitase2229412017-07-12 12:30:40 +0100116 _policy = policy;
117 _shift = shift;
118
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100119 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
Georgios Pinitase2229412017-07-12 12:30:40 +0100120
121 // Configure kernel window
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100122 auto win_config = validate_and_configure_window(input->info(), output->info());
123 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
124 ICPPKernel::configure(win_config.second);
125}
Georgios Pinitase2229412017-07-12 12:30:40 +0100126
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100127Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
128{
129 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
130 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
131
132 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100133}
134
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000135void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100137 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100138 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Georgios Pinitase2229412017-07-12 12:30:40 +0100139 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100140 ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141 ARM_COMPUTE_ERROR_ON(_input == _output);
142
143 Iterator input(_input, window);
144 Iterator output(_output, window);
145
146 switch(_input->info()->data_type())
147 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000148 case DataType::QASYMM8:
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100149 case DataType::U8:
150 {
151 const int16x8_t b = vdupq_n_s16(_shift);
152
153 switch(_output->info()->data_type())
154 {
155 case DataType::S16:
156 {
157 /* Up-conversion U8 -> S16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100158 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159 {
160 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
161
162 const int16x8x2_t texels =
163 {
164 {
165 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
166 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
167 }
168 };
169
170 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
171 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
172 },
173 input, output);
174 break;
175 }
176 case DataType::S32:
177 {
178 /* Up-conversion U8 -> S32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100179 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100180 {
181 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
182
183 const int16x8x2_t texels =
184 {
185 {
186 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
187 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
188 }
189 };
190
191 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
192 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
193 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
194 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
195 },
196 input, output);
197 break;
198 }
Usama Arif9e631c22019-05-14 17:10:40 +0100199 case DataType::F32:
200 {
201 /* Up-conversion U8 -> F32 */
202 execute_window_loop(window, [&](const Coordinates &)
203 {
204 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
205
206 const int16x8x2_t texels =
207 {
208 {
209 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
210 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
211 }
212 };
213 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
214 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
215 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
216 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
217 },
218 input, output);
219 break;
220 }
221#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
222 case DataType::F16:
223 {
224 /* Up-conversion U8 -> F16 */
225 execute_window_loop(window, [&](const Coordinates &)
226 {
227 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
228
229 const int16x8x2_t texels =
230 {
231 {
232 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
233 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
234 }
235 };
236 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcvtq_f16_s16(texels.val[0]));
237 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcvtq_f16_s16(texels.val[1]));
238 },
239 input, output);
240 break;
241 }
242#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
243
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100244 case DataType::U16:
245 {
246 /* Up-conversion U8 -> U16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100247 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100248 {
249 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
250
251 const uint16x8x2_t texels =
252 {
253 {
254 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
255 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
256 }
257 };
258
259 vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
260 vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
261 },
262 input, output);
263 break;
264 }
265 default:
266 ARM_COMPUTE_ERROR("Output data type not supported");
267 }
268 break;
269 }
270 case DataType::S16:
271 {
272 switch(_output->info()->data_type())
273 {
274 case DataType::U8:
275 {
276 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
277
278 /* Down-conversion S16 -> U8 */
279 if(ConvertPolicy::SATURATE == _policy)
280 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100281 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100282 {
283 const int16x8x2_t texels =
284 {
285 {
286 vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
287 vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
288 }
289 };
290
291 vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
292 },
293 input, output);
294 }
295 else
296 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100297 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100298 {
299 const int16x8x2_t texels =
300 {
301 {
302 vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
303 vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
304 }
305 };
306
307 vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
308 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
309 },
310 input, output);
311 }
312 break;
313 }
314 case DataType::S32:
315 {
316 const int32x4_t b = vdupq_n_s32(_shift);
317
318 /* Up-conversion S16 -> S32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100319 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100320 {
321 const int16x8x2_t texels =
322 {
323 {
324 vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
325 vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
326 }
327 };
328
329 const int32x4x4_t texels_s32 =
330 {
331 {
332 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
333 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
334 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
335 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
336 }
337 };
338
339 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
340 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
341 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
342 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
343 },
344 input, output);
345 break;
346 }
347 default:
348 ARM_COMPUTE_ERROR("Output data type not supported");
349 }
350 break;
351 }
352 case DataType::U16:
353 {
354 switch(_output->info()->data_type())
355 {
356 case DataType::U8:
357 {
358 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
359
360 /* Down-conversion U16 -> U8 */
361 if(ConvertPolicy::SATURATE == _policy)
362 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100363 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100364 {
365 const uint16x8x2_t texels =
366 {
367 {
368 vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
369 vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
370 }
371 };
372
373 vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
374 },
375 input, output);
376 }
377 else
378 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100379 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100380 {
381 const uint16x8x2_t texels =
382 {
383 {
384 vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
385 vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
386 }
387 };
388
389 vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
390 },
391 input, output);
392 }
393 break;
394 }
395 case DataType::U32:
396 {
397 const int32x4_t b = vdupq_n_s32(_shift);
398
399 /* Up-conversion U16 -> U32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100400 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100401 {
402 const uint16x8x2_t texels =
403 {
404 {
405 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
406 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
407 }
408 };
409
410 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
411 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
412 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
413 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
414 },
415 input, output);
416 break;
417 }
418 default:
419 ARM_COMPUTE_ERROR("Output data type not supported");
420 }
421 break;
422 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100423#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
424 case DataType::F16:
425 switch(_output->info()->data_type())
426 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000427 case DataType::QASYMM8:
Usama Arif9e631c22019-05-14 17:10:40 +0100428 case DataType::U8:
Michalis Spyroue2588182018-12-13 18:31:18 +0000429 {
Usama Arif9e631c22019-05-14 17:10:40 +0100430 const float16x8_t scale = vdupq_n_f16(1 << _shift);
Michalis Spyroue2588182018-12-13 18:31:18 +0000431
Usama Arif9e631c22019-05-14 17:10:40 +0100432 /* Up-conversion F16 -> U8 */
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100433 execute_window_loop(window, [&](const Coordinates &)
Michalis Spyroue2588182018-12-13 18:31:18 +0000434 {
435 const float16x8x2_t texels =
436 {
437 {
438 vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())), scale),
439 vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8), scale),
440 }
441 };
442
Usama Arif9e631c22019-05-14 17:10:40 +0100443 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
Michalis Spyroue2588182018-12-13 18:31:18 +0000444 },
445 input, output);
446 break;
447 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100448 case DataType::F32:
449 {
450 const float32x4_t scale = vdupq_n_f32(1 << _shift);
451
452 /* Up-conversion F16 -> F32 */
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100453 execute_window_loop(window, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100454 {
455 const float16x8x2_t texels =
456 {
457 {
458 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
459 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
460 }
461 };
462
463 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
464 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
465 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
466 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
467 },
468 input, output);
469 break;
470 }
Usama Arif9e631c22019-05-14 17:10:40 +0100471 case DataType::S32:
472 {
473 const float32x4_t scale = vdupq_n_f32(1 << _shift);
474
475 /* Up-conversion F16 -> S32 */
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100476 execute_window_loop(window, [&](const Coordinates &)
Usama Arif9e631c22019-05-14 17:10:40 +0100477 {
478 const float16x8x2_t texels =
479 {
480 {
481 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
482 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
483 }
484 };
485
486 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
487 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
488 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
489 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
490 },
491 input, output);
492 break;
493 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100494 default:
495 ARM_COMPUTE_ERROR("Output data type not supported");
496 }
497 break;
Michalis Spyroue2588182018-12-13 18:31:18 +0000498#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100499 case DataType::F32:
500 switch(_output->info()->data_type())
501 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000502#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100503 case DataType::F16:
504 {
505 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
506
507 /* Down-conversion F32 -> F16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100508 execute_window_loop(window, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100509 {
510 const float32x4x4_t texels =
511 {
512 {
513 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
514 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
515 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
516 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
517 }
518 };
519
520 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
521 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
522 },
523 input, output);
524 break;
525 }
Michalis Spyroue2588182018-12-13 18:31:18 +0000526#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Usama Arif9e631c22019-05-14 17:10:40 +0100527 case DataType::S32:
528 {
529 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
530
531 /* Conversion F32 -> S32 */
532 execute_window_loop(window, [&](const Coordinates &)
533 {
534 const float32x4x4_t texels =
535 {
536 {
537 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
538 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
539 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
540 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
541 }
542 };
543
544 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(texels.val[0]));
545 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(texels.val[1]));
546 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(texels.val[2]));
547 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(texels.val[3]));
548 },
549 input, output);
550 break;
551 }
552 case DataType::QASYMM8:
553 case DataType::U8:
554 {
555 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
556
557 /* Down-conversion F32 -> U8 */
558 execute_window_loop(window, [&](const Coordinates &)
559 {
560 const float32x4x4_t texels =
561 {
562 {
563 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
564 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
565 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
566 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
567 }
568 };
569
570 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100571 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()) + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
Usama Arif9e631c22019-05-14 17:10:40 +0100572 },
573 input, output);
574 break;
575 }
576
Usama Arif9e631c22019-05-14 17:10:40 +0100577 default:
578 ARM_COMPUTE_ERROR("Output data type not supported");
579 }
580 break;
581
582 case DataType::S32:
583 switch(_output->info()->data_type())
584 {
585#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
586 case DataType::F16:
587 {
588 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
589
590 /* Down-conversion S32 -> F16 */
591 execute_window_loop(window, [&](const Coordinates &)
592 {
593 const float32x4x4_t texels =
594 {
595 {
596 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()))), scale),
597 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4)), scale),
598 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8)), scale),
599 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12)), scale)
600 }
601 };
602
603 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
604 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
605 },
606 input, output);
607 break;
608 }
609#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
610 case DataType::F32:
611 {
612 const int32x4_t scale = vdupq_n_s32(1.f / (1 << _shift));
613
614 /* Conversion S32 -> F32 */
615 execute_window_loop(window, [&](const Coordinates &)
616 {
617 const int32x4x4_t texels =
618 {
619 {
620 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), scale),
621 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), scale),
622 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), scale),
623 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), scale),
624 }
625 };
626
627 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(texels.val[0]));
628 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(texels.val[1]));
629 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(texels.val[2]));
630 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(texels.val[3]));
631 },
632 input, output);
633 break;
634 }
635 case DataType::QASYMM8:
636 case DataType::U8:
637 {
638 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
639
640 /* Down-conversion S32 -> U8 */
641 if(ConvertPolicy::SATURATE == _policy)
642 {
643 execute_window_loop(window, [&](const Coordinates &)
644 {
645 const int32x4x4_t texels =
646 {
647 {
648 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
649 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
650 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
651 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
652 }
653 };
Usama Arifc5c750d2019-05-24 11:13:20 +0100654 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100655 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()) + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
Usama Arif9e631c22019-05-14 17:10:40 +0100656 },
657 input, output);
658 }
659 else
660 {
661 execute_window_loop(window, [&](const Coordinates &)
662 {
663 const int32x4x4_t texels =
664 {
665 {
666 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
667 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
668 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
669 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
670 }
671 };
672
Michalis Spyrou6bff1952019-10-02 17:22:11 +0100673 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
674 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()) + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
Usama Arif9e631c22019-05-14 17:10:40 +0100675 },
676 input, output);
677 }
678 break;
679 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100680 default:
681 ARM_COMPUTE_ERROR("Output data type not supported");
682 }
683 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100684 default:
685 ARM_COMPUTE_ERROR("Not supported");
686 }
687}