blob: db76df9076ff4bafbc112f4e89ee48dd59d5bbbd [file] [log] [blame]
Georgios Pinitas11d84152021-04-28 10:20:18 +01001/*
2 * Copyright (c) 2016-2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuCastKernel.h"
Georgios Pinitas11d84152021-04-28 10:20:18 +010025
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Validate.h"
31#include "src/core/CPP/Validate.h"
32#include "src/core/NEON/NEFixedPoint.h"
33#include "src/core/NEON/NEMath.h"
34#include "src/core/NEON/wrapper/wrapper.h"
35#include "src/core/helpers/AutoConfiguration.h"
36#include "src/core/helpers/WindowHelpers.h"
37#include "support/SaturateCast.h"
38
39namespace arm_compute
40{
41namespace cpu
42{
43namespace kernels
44{
45namespace
46{
47Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
48{
49 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
50 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
51 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
52 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
53 ARM_COMPUTE_UNUSED(policy);
54 ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
55 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
56 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
57 DataType::F32, DataType::S32);
58 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
59 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
60 DataType::U32, DataType::S32, DataType::F32);
61
62 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
63 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
64 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
65
66 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
67 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
68 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
69
70 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
71 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
72 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
73
74 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
75 "Only data_types supported [in] U16 -> [out] U8, U32");
76
77 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
78 "Only data_types supported [in] S16 -> [out] U8, S32");
79
80 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
81 "Only data_types supported [in] BFLOAT16 -> [out] F32");
82
83 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
84 && dst->data_type() != DataType::U8
85 && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
86 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
87
88 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
89 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
90 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
91 "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
92
93 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
94 && dst->data_type() != DataType::F16
95 && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
96 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
97
98 // Validate in case of configured dst
99 if(dst->total_size() > 0)
100 {
101 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
102 }
103
104 return Status{};
105}
106} // namespace
107
108void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
109{
110 ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
111
112 // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
113 set_shape_if_empty(*dst, src->tensor_shape());
114
115 _policy = policy;
116
117 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
118
119 // Configure kernel window
120 Window win = calculate_max_window(*src, Steps());
121
122 ICPPKernel::configure(win);
123}
124
125Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
126{
127 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
128 return Status{};
129}
130
131void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
132{
133 ARM_COMPUTE_UNUSED(info);
134 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
135 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
136
137 const auto window_start_x = static_cast<int>(window.x().start());
138 const auto window_end_x = static_cast<int>(window.x().end());
139 const int window_step_x = 16;
140
141 const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
142 ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
143 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
144 ARM_COMPUTE_ERROR_ON(_src == _dst);
145
146 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
147
148 Window win{ window };
149 win.set(Window::DimX, Window::Dimension(0, 1, 1));
150
151 Iterator src(_src, win);
152 Iterator dst(_dst, win);
153
154 switch(_src->info()->data_type())
155 {
156 case DataType::QASYMM8_SIGNED:
157 {
158 switch(_dst->info()->data_type())
159 {
160 case DataType::S16:
161 {
162 /* Up-conversion QASYMM8_SIGNED -> S16 */
163 execute_window_loop(win, [&](const Coordinates &)
164 {
165 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
166 const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
167 int x = window_start_x;
168
169 for(; x <= (window_end_x - window_step_x); x += window_step_x)
170 {
171 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
172
173 const int16x8x2_t texels =
174 {
175 {
176 vmovl_s8(vget_low_s8(texels_s8)),
177 vmovl_s8(vget_high_s8(texels_s8))
178 }
179 };
180
181 vst1q_s16(dst_ptr + x, texels.val[0]);
182 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
183 }
184
185 // Compute left-over elements
186 for(; x < window_end_x; ++x)
187 {
188 *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
189 }
190 },
191 src, dst);
192 break;
193 }
194 case DataType::S32:
195 {
196 /* Up-conversion QASYMM8_SIGNED -> S32 */
197 execute_window_loop(win, [&](const Coordinates &)
198 {
199 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
200 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
201 int x = window_start_x;
202
203 for(; x <= (window_end_x - window_step_x); x += window_step_x)
204 {
205 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
206
207 const int16x8x2_t texels =
208 {
209 {
210 vmovl_s8(vget_low_s8(texels_s8)),
211 vmovl_s8(vget_high_s8(texels_s8))
212 }
213 };
214
215 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
216 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
217 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
218 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
219 }
220
221 // Compute left-over elements
222 for(; x < window_end_x; ++x)
223 {
224 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
225 }
226 },
227 src, dst);
228 break;
229 }
230 case DataType::F32:
231 {
232 /* Up-conversion QASYMM8_SIGNED -> F32 */
233 execute_window_loop(win, [&](const Coordinates &)
234 {
235 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
236 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
237
238 int x = window_start_x;
239 for(; x <= (window_end_x - window_step_x); x += window_step_x)
240 {
241 const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(src.ptr()));
242
243 const int16x8x2_t texels =
244 {
245 {
246 vmovl_s8(vget_low_s8(texels_s8)),
247 vmovl_s8(vget_high_s8(texels_s8))
248 }
249 };
250 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
251 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
252 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
253 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
254 }
255
256 // Compute left-over elements
257 for(; x < window_end_x; ++x)
258 {
259 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
260 }
261 },
262 src, dst);
263 break;
264 }
265#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
266 case DataType::F16:
267 {
268 /* Up-conversion QASYMM8_SIGNED -> F16 */
269 execute_window_loop(win, [&](const Coordinates &)
270 {
271 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
272 const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
273 int x = window_start_x;
274
275 for(; x <= (window_end_x - window_step_x); x += window_step_x)
276 {
277 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
278
279 const int16x8x2_t texels =
280 {
281 {
282 vmovl_s8(vget_low_s8(texels_s8)),
283 vmovl_s8(vget_high_s8(texels_s8))
284 }
285 };
286 vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
287 vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
288 }
289
290 // Compute left-over elements
291 for(; x < window_end_x; ++x)
292 {
293 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
294 }
295 },
296 src, dst);
297 break;
298 }
299#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
300
301 default:
302 ARM_COMPUTE_ERROR("dst data type not supported");
303 }
304 break;
305 }
306
307 case DataType::QASYMM8:
308 case DataType::U8:
309 {
310 switch(_dst->info()->data_type())
311 {
312 case DataType::S16:
313 {
314 /* Up-conversion U8 -> S16 */
315 execute_window_loop(win, [&](const Coordinates &)
316 {
317 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
318 const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
319
320 int x = window_start_x;
321 for(; x <= (window_end_x - window_step_x); x += window_step_x)
322 {
323 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
324
325 const int16x8x2_t texels =
326 {
327 {
328 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
329 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
330 }
331 };
332
333 vst1q_s16(dst_ptr + x, texels.val[0]);
334 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
335 }
336
337 // Compute left-over elements
338 for(; x < window_end_x; ++x)
339 {
340 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
341 }
342 },
343 src, dst);
344 break;
345 }
346 case DataType::S32:
347 {
348 /* Up-conversion U8 -> S32 */
349 execute_window_loop(win, [&](const Coordinates &)
350 {
351 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
352 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
353
354 int x = window_start_x;
355 for(; x <= (window_end_x - window_step_x); x += window_step_x)
356 {
357 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
358
359 const int16x8x2_t texels =
360 {
361 {
362 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
363 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
364 }
365 };
366
367 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
368 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
369 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
370 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
371 }
372
373 // Compute left-over elements
374 for(; x < window_end_x; ++x)
375 {
376 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
377 }
378 },
379 src, dst);
380 break;
381 }
382 case DataType::F32:
383 {
384 /* Up-conversion U8 -> F32 */
385 execute_window_loop(win, [&](const Coordinates &)
386 {
387 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
388 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
389
390 int x = window_start_x;
391 for(; x <= (window_end_x - window_step_x); x += window_step_x)
392 {
393 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
394
395 const int16x8x2_t texels =
396 {
397 {
398 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
399 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
400 }
401 };
402 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
403 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
404 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
405 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
406 }
407
408 // Compute left-over elements
409 for(; x < window_end_x; ++x)
410 {
411 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
412 }
413 },
414 src, dst);
415 break;
416 }
417#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
418 case DataType::F16:
419 {
420 /* Up-conversion U8 -> F16 */
421 execute_window_loop(win, [&](const Coordinates &)
422 {
423 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
424 const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
425
426 int x = window_start_x;
427 for(; x <= (window_end_x - window_step_x); x += window_step_x)
428 {
429 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
430
431 const int16x8x2_t texels =
432 {
433 {
434 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
435 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
436 }
437 };
438 vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
439 vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
440 }
441
442 // Compute left-over elements
443 for(; x < window_end_x; ++x)
444 {
445 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
446 }
447 },
448 src, dst);
449 break;
450 }
451#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
452 case DataType::U16:
453 {
454 /* Up-conversion U8 -> U16 */
455 execute_window_loop(win, [&](const Coordinates &)
456 {
457 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
458 const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
459
460 int x = window_start_x;
461 for(; x <= (window_end_x - window_step_x); x += window_step_x)
462 {
463 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
464
465 const uint16x8x2_t texels =
466 {
467 {
468 vmovl_u8(vget_low_u8(texels_u8)),
469 vmovl_u8(vget_high_u8(texels_u8))
470 }
471 };
472
473 vst1q_u16(dst_ptr + x, texels.val[0]);
474 vst1q_u16(dst_ptr + x + 8, texels.val[1]);
475 }
476
477 // Compute left-over elements
478 for(; x < window_end_x; ++x)
479 {
480 *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
481 }
482 },
483 src, dst);
484 break;
485 }
486 default:
487 ARM_COMPUTE_ERROR("dst data type not supported");
488 }
489 break;
490 }
491 case DataType::S16:
492 {
493 switch(_dst->info()->data_type())
494 {
495 case DataType::QASYMM8_SIGNED:
496 {
497 /* Down-conversion S16 -> QASYMM8_SIGNED */
498 if(ConvertPolicy::SATURATE == _policy)
499 {
500 execute_window_loop(win, [&](const Coordinates &)
501 {
502 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
503 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
504
505 int x = window_start_x;
506 for(; x <= (window_end_x - window_step_x); x += window_step_x)
507 {
508 const int16x8x2_t texels =
509 {
510 {
511 vld1q_s16(src_ptr + x),
512 vld1q_s16(src_ptr + x + 8)
513 }
514 };
515
516 vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
517 }
518
519 // Compute left-over elements
520 for(; x < window_end_x; ++x)
521 {
522 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
523 }
524 },
525 src, dst);
526 }
527 else
528 {
529 execute_window_loop(win, [&](const Coordinates &)
530 {
531 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
532 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
533
534 int x = window_start_x;
535 for(; x <= (window_end_x - window_step_x); x += window_step_x)
536 {
537 const int16x8x2_t texels =
538 {
539 {
540 vld1q_s16(src_ptr + x),
541 vld1q_s16(src_ptr + x + 8)
542 }
543 };
544
545 vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
546 }
547
548 // Compute left-over elements
549 for(; x < window_end_x; ++x)
550 {
551 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
552 }
553 },
554 src, dst);
555 }
556 break;
557 }
558 case DataType::U8:
559 {
560 /* Down-conversion S16 -> U8 */
561 if(ConvertPolicy::SATURATE == _policy)
562 {
563 execute_window_loop(win, [&](const Coordinates &)
564 {
565 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
566 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
567
568 int x = window_start_x;
569 for(; x <= (window_end_x - window_step_x); x += window_step_x)
570 {
571 const int16x8x2_t texels =
572 {
573 {
574 vld1q_s16(src_ptr + x),
575 vld1q_s16(src_ptr + x + 8)
576 }
577 };
578
579 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
580 }
581
582 // Compute left-over elements
583 for(; x < window_end_x; ++x)
584 {
585 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
586 }
587 },
588 src, dst);
589 }
590 else
591 {
592 execute_window_loop(win, [&](const Coordinates &)
593 {
594 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
595 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
596
597 int x = window_start_x;
598 for(; x <= (window_end_x - window_step_x); x += window_step_x)
599 {
600 const int16x8x2_t texels =
601 {
602 {
603 vld1q_s16(src_ptr + x),
604 vld1q_s16(src_ptr + x + 8)
605 }
606 };
607
608 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
609 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
610 }
611
612 // Compute left-over elements
613 for(; x < window_end_x; ++x)
614 {
615 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
616 }
617 },
618 src, dst);
619 }
620 break;
621 }
622 case DataType::S32:
623 {
624 /* Up-conversion S16 -> S32 */
625 execute_window_loop(win, [&](const Coordinates &)
626 {
627 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
628 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
629
630 int x = window_start_x;
631 for(; x <= (window_end_x - window_step_x); x += window_step_x)
632 {
633 const int16x8x2_t texels =
634 {
635 {
636 vld1q_s16(src_ptr + x),
637 vld1q_s16(src_ptr + x + 8)
638 }
639 };
640
641 const int32x4x4_t texels_s32 =
642 {
643 {
644 vmovl_s16(vget_low_s16(texels.val[0])),
645 vmovl_s16(vget_high_s16(texels.val[0])),
646 vmovl_s16(vget_low_s16(texels.val[1])),
647 vmovl_s16(vget_high_s16(texels.val[1]))
648 }
649 };
650
651 vst1q_s32(dst_ptr + x, texels_s32.val[0]);
652 vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
653 vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
654 vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
655 }
656
657 // Compute left-over elements
658 for(; x < window_end_x; ++x)
659 {
660 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
661 }
662 },
663 src, dst);
664 break;
665 }
666 default:
667 ARM_COMPUTE_ERROR("dst data type not supported");
668 }
669 break;
670 }
671 case DataType::U16:
672 {
673 switch(_dst->info()->data_type())
674 {
675 case DataType::U8:
676 {
677 /* Down-conversion U16 -> U8 */
678 if(ConvertPolicy::SATURATE == _policy)
679 {
680 execute_window_loop(win, [&](const Coordinates &)
681 {
682 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
683 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
684
685 int x = window_start_x;
686 for(; x <= (window_end_x - window_step_x); x += window_step_x)
687 {
688 const uint16x8x2_t texels =
689 {
690 {
691 vld1q_u16(src_ptr + x),
692 vld1q_u16(src_ptr + x + 8)
693 }
694 };
695
696 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
697 }
698
699 // Compute left-over elements
700 for(; x < window_end_x; ++x)
701 {
702 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
703 }
704 },
705 src, dst);
706 }
707 else
708 {
709 execute_window_loop(win, [&](const Coordinates &)
710 {
711 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
712 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
713
714 int x = window_start_x;
715 for(; x <= (window_end_x - window_step_x); x += window_step_x)
716 {
717 const uint16x8x2_t texels =
718 {
719 {
720 vld1q_u16(src_ptr + x),
721 vld1q_u16(src_ptr + x + 8)
722 }
723 };
724
725 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
726 }
727
728 // Compute left-over elements
729 for(; x < window_end_x; ++x)
730 {
731 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
732 }
733
734 },
735 src, dst);
736 }
737 break;
738 }
739 case DataType::U32:
740 {
741 /* Up-conversion U16 -> U32 */
742 execute_window_loop(win, [&](const Coordinates &)
743 {
744 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
745 const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
746
747 int x = window_start_x;
748 for(; x <= (window_end_x - window_step_x); x += window_step_x)
749 {
750 const uint16x8x2_t texels =
751 {
752 {
753 vld1q_u16(src_ptr + x),
754 vld1q_u16(src_ptr + x + 8)
755 }
756 };
757
758 vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
759 vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
760 vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
761 vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
762 }
763 // Compute left-over elements
764 for(; x < window_end_x; ++x)
765 {
766 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
767 }
768
769 },
770 src, dst);
771 break;
772 }
773 default:
774 ARM_COMPUTE_ERROR("dst data type not supported");
775 }
776 break;
777 }
778#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
779 case DataType::BFLOAT16:
780 switch(_dst->info()->data_type())
781 {
782 case DataType::F32:
783 {
784 /* Up-conversion BFLOAT16 -> F32 */
785 execute_window_loop(win, [&](const Coordinates &)
786 {
787 const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
788 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
789
790 int x = window_start_x;
791 for(; x <= (window_end_x - window_step_x); x += window_step_x)
792 {
793 const uint16x8x2_t texels =
794 {
795 {
796 vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr())),
797 vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr()) + 8)
798 }
799 };
800
801 vst1q_f32(reinterpret_cast<float *>(dst.ptr()),
802 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
803 vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 4,
804 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
805 vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 8,
806 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
807 vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 12,
808 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
809 }
810
811 for(; x < window_end_x; ++x)
812 {
813 *(dst_ptr + x) = float(*(src_ptr + x));
814 }
815 },
816 src, dst);
817 break;
818 }
819 default:
820 ARM_COMPUTE_ERROR("dst data type unsupported");
821 }
822 break;
823#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
824#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
825 case DataType::F16:
826 switch(_dst->info()->data_type())
827 {
828 case DataType::QASYMM8_SIGNED:
829 {
830 /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
831 execute_window_loop(win, [&](const Coordinates &)
832 {
833 const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
834 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
835
836 int x = window_start_x;
837 for(; x <= (window_end_x - window_step_x); x += window_step_x)
838 {
839 const float16x8x2_t texels =
840 {
841 {
842 vld1q_f16(src_ptr + x),
843 vld1q_f16(src_ptr + x + 8),
844 }
845 };
846
847 vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
848 }
849
850 // Compute left-over elements
851 for(; x < window_end_x; ++x)
852 {
853 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
854 }
855 },
856 src, dst);
857 break;
858 }
859 case DataType::QASYMM8:
860 case DataType::U8:
861 {
862 /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
863 execute_window_loop(win, [&](const Coordinates &)
864 {
865 const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
866 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
867
868 int x = window_start_x;
869 for(; x <= (window_end_x - window_step_x); x += window_step_x)
870 {
871 const float16x8x2_t texels =
872 {
873 {
874 vld1q_f16(src_ptr + x),
875 vld1q_f16(src_ptr + x + 8),
876 }
877 };
878
879 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
880 }
881
882 // Compute left-over elements
883 for(; x < window_end_x; ++x)
884 {
885 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
886 }
887
888 },
889 src, dst);
890 break;
891 }
892 case DataType::F32:
893 {
894 /* Up-conversion F16 -> F32 */
895 execute_window_loop(win, [&](const Coordinates &)
896 {
897 const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
898 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
899
900 int x = window_start_x;
901 for(; x <= (window_end_x - window_step_x); x += window_step_x)
902 {
903 const float16x8x2_t texels =
904 {
905 {
906 vld1q_f16(src_ptr + x),
907 vld1q_f16(src_ptr + x + 8)
908 }
909 };
910 vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
911 vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
912 vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
913 vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
914 }
915
916 // Compute left-over elements
917 for(; x < window_end_x; ++x)
918 {
919 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
920 }
921 },
922 src, dst);
923 break;
924 }
925 case DataType::S32:
926 {
927 /* Up-conversion F16 -> S32 */
928 execute_window_loop(win, [&](const Coordinates &)
929 {
930 const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
931 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
932
933 int x = window_start_x;
934 for(; x <= (window_end_x - window_step_x); x += window_step_x)
935 {
936 const float16x8x2_t texels =
937 {
938 {
939 vld1q_f16(src_ptr + x),
940 vld1q_f16(src_ptr + x + 8)
941 }
942 };
943
944 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
945 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
946 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
947 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
948 }
949
950 // Compute left-over elements
951 for(; x < window_end_x; ++x)
952 {
953 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
954 }
955 },
956 src, dst);
957 break;
958 }
959 default:
960 ARM_COMPUTE_ERROR("dst data type not supported");
961 }
962 break;
963#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
964 case DataType::F32:
965 switch(_dst->info()->data_type())
966 {
967#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
968 case DataType::F16:
969 {
970 /* Down-conversion F32 -> F16 */
971 execute_window_loop(win, [&](const Coordinates &)
972 {
973 const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
974 const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
975
976 int x = window_start_x;
977 for(; x <= (window_end_x - window_step_x); x += window_step_x)
978 {
979 const float32x4x4_t texels =
980 {
981 {
982 vld1q_f32(src_ptr + x),
983 vld1q_f32(src_ptr + x + 4),
984 vld1q_f32(src_ptr + x + 8),
985 vld1q_f32(src_ptr + x + 12)
986 }
987 };
988
989 vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
990 vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
991 }
992
993 // Compute left-over elements
994 for(; x < window_end_x; ++x)
995 {
996 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
997 }
998 },
999 src, dst);
1000 break;
1001 }
1002#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1003#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
1004 case DataType::BFLOAT16:
1005 {
1006 /* Down-conversion F32 -> BFLOAT16 */
1007 execute_window_loop(win, [&](const Coordinates &)
1008 {
1009 const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
1010 const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());
1011
1012 int x = window_start_x;
1013 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1014 {
1015 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()),
1016 reinterpret_cast<uint16_t *>(dst.ptr()));
1017 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()) + 8,
1018 reinterpret_cast<uint16_t *>(dst.ptr()) + 8);
1019 }
1020
1021 for(; x < window_end_x; ++x)
1022 {
1023 *(dst_ptr + x) = *(src_ptr + x);
1024 }
1025 },
1026 src, dst);
1027 break;
1028 }
1029#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
        case DataType::S32:
        {
            /* Conversion F32 -> S32 */
            execute_window_loop(win, [&](const Coordinates &)
            {
                // Row pointers for the current window position.
                const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

                int x = window_start_x;
                // Vector body: load one window_step_x-wide chunk as four float32x4 lanes.
                for(; x <= (window_end_x - window_step_x); x += window_step_x)
                {
                    const float32x4x4_t texels =
                    {
                        {
                            vld1q_f32(src_ptr + x),
                            vld1q_f32(src_ptr + x + 4),
                            vld1q_f32(src_ptr + x + 8),
                            vld1q_f32(src_ptr + x + 12),
                        }
                    };

                    // vcvtq_s32_f32 converts with round-towards-zero and saturates
                    // out-of-range values (Neon semantics).
                    // NOTE(review): the scalar tail below uses static_cast, which has no
                    // such guarantee for out-of-range floats — confirm inputs fit int32.
                    vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
                    vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
                    vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
                    vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
                }

                // Compute left-over elements
                for(; x < window_end_x; ++x)
                {
                    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
                }
            },
            src, dst);
            break;
        }
        case DataType::QASYMM8:
        case DataType::U8:
        {
            /* Down-conversion F32 -> U8 */
            execute_window_loop(win, [&](const Coordinates &)
            {
                // Row pointers for the current window position.
                const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                int x = window_start_x;
                for(; x <= (window_end_x - window_step_x); x += window_step_x)
                {
                    const float32x4x4_t texels =
                    {
                        {
                            vld1q_f32(src_ptr + x),
                            vld1q_f32(src_ptr + x + 4),
                            vld1q_f32(src_ptr + x + 8),
                            vld1q_f32(src_ptr + x + 12),
                        }
                    };

                    // Saturating narrowing chain: f32 -> s32 (vcvtq, truncating),
                    // s32 -> u16 (vqmovun clamps negatives to 0), u16 -> u8 (vqmovn).
                    // Note the vector path always saturates regardless of _policy,
                    // matching the saturate_cast used in the scalar tail.
                    vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
                    vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
                }

                // Compute left-over elements
                for(; x < window_end_x; ++x)
                {
                    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                }
            },
            src, dst);
            break;
        }
        case DataType::QASYMM8_SIGNED:
        {
            /* Down-conversion F32 -> QASYMM8_SIGNED */
            execute_window_loop(win, [&](const Coordinates &)
            {
                // Row pointers for the current window position.
                const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

                int x = window_start_x;
                for(; x <= (window_end_x - window_step_x); x += window_step_x)
                {
                    const float32x4x4_t texels =
                    {
                        {
                            vld1q_f32(src_ptr + x),
                            vld1q_f32(src_ptr + x + 4),
                            vld1q_f32(src_ptr + x + 8),
                            vld1q_f32(src_ptr + x + 12),
                        }
                    };

                    // Saturating narrowing chain: f32 -> s32 (vcvtq, truncating),
                    // then s32 -> s16 -> s8 via vqmovn, clamping to [-128, 127].
                    vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
                    vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
                }
                // Compute left-over elements
                for(; x < window_end_x; ++x)
                {
                    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
                }
            },
            src, dst);
            break;
        }
1134
1135 default:
1136 ARM_COMPUTE_ERROR("dst data type not supported");
1137 }
1138 break;
1139
1140 case DataType::S32:
1141 switch(_dst->info()->data_type())
1142 {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
        {
            /* Down-conversion S32 -> F16 */
            execute_window_loop(win, [&](const Coordinates &)
            {
                // Row pointers for the current window position.
                const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

                int x = window_start_x;
                for(; x <= (window_end_x - window_step_x); x += window_step_x)
                {
                    // Widen s32 -> f32 lane-wise first; the f16 narrowing happens below.
                    const float32x4x4_t texels =
                    {
                        {
                            vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
                            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
                            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
                            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
                        }
                    };

                    // f32 -> f16 narrowing; values beyond f16 range are expected to
                    // follow IEEE narrowing (overflow to inf) — precision loss is
                    // inherent to this down-conversion.
                    vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                    vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                }

                // Compute left-over elements
                for(; x < window_end_x; ++x)
                {
                    *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
                }
            },
            src, dst);
            break;
        }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1179 case DataType::F32:
1180 {
1181 /* Conversion S32 -> F32 */
1182 execute_window_loop(win, [&](const Coordinates &)
1183 {
1184 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1185 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
1186
1187 int x = window_start_x;
1188 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1189 {
1190 const int32x4x4_t texels =
1191 {
1192 {
1193 vld1q_s32(src_ptr + x),
1194 vld1q_s32(src_ptr + x + 4),
1195 vld1q_s32(src_ptr + x + 8),
1196 vld1q_s32(src_ptr + x + 12),
1197 }
1198 };
1199
1200 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
1201 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1202 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1203 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1204 }
1205
1206 // Compute left-over elements
1207 for(; x < window_end_x; ++x)
1208 {
1209 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
1210 }
1211 },
1212 src, dst);
1213 break;
1214 }
        case DataType::QASYMM8_SIGNED:
        {
            /* Down-conversion S32 -> QASYMM8_SIGNED */
            // Two code paths: SATURATE clamps to [-128, 127] via vqmovn; otherwise the
            // value wraps by keeping only the low 8 bits (vmovn truncation).
            if(ConvertPolicy::SATURATE == _policy)
            {
                execute_window_loop(win, [&](const Coordinates &)
                {
                    // Row pointers for the current window position.
                    const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                    const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

                    int x = window_start_x;
                    for(; x <= (window_end_x - window_step_x); x += window_step_x)
                    {
                        const int32x4x4_t texels =
                        {
                            {
                                vld1q_s32(src_ptr + x),
                                vld1q_s32(src_ptr + x + 4),
                                vld1q_s32(src_ptr + x + 8),
                                vld1q_s32(src_ptr + x + 12),
                            }
                        };
                        // Saturating narrow: s32 -> s16 -> s8.
                        vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
                        vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
                    }

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
                    }
                },
                src, dst);
            }
            else
            {
                execute_window_loop(win, [&](const Coordinates &)
                {
                    // Row pointers for the current window position.
                    const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                    const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

                    int x = window_start_x;
                    for(; x <= (window_end_x - window_step_x); x += window_step_x)
                    {
                        const int32x4x4_t texels =
                        {
                            {
                                vld1q_s32(src_ptr + x),
                                vld1q_s32(src_ptr + x + 4),
                                vld1q_s32(src_ptr + x + 8),
                                vld1q_s32(src_ptr + x + 12)
                            }
                        };

                        // Non-saturating narrow (vmovn keeps the low half of each lane).
                        vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
                        vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
                    }

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
                    }
                },
                src, dst);
            }
            break;
        }
        case DataType::QASYMM8:
        case DataType::U8:
        {
            /* Down-conversion S32 -> U8 */
            // Two code paths: SATURATE clamps to [0, 255] (vqmovun maps negatives to 0);
            // otherwise the value wraps by keeping only the low 8 bits.
            if(ConvertPolicy::SATURATE == _policy)
            {
                execute_window_loop(win, [&](const Coordinates &)
                {
                    // Row pointers for the current window position.
                    const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                    const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                    int x = window_start_x;
                    for(; x <= (window_end_x - window_step_x); x += window_step_x)
                    {
                        const int32x4x4_t texels =
                        {
                            {
                                vld1q_s32(src_ptr + x),
                                vld1q_s32(src_ptr + x + 4),
                                vld1q_s32(src_ptr + x + 8),
                                vld1q_s32(src_ptr + x + 12)
                            }
                        };
                        // Saturating narrow: s32 -> u16 (vqmovun) -> u8 (vqmovn).
                        vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
                        vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
                    }

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                    }
                },
                src, dst);
            }
            else
            {
                execute_window_loop(win, [&](const Coordinates &)
                {
                    // Row pointers for the current window position.
                    const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                    const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                    int x = window_start_x;
                    for(; x <= (window_end_x - window_step_x); x += window_step_x)
                    {
                        const int32x4x4_t texels =
                        {
                            {
                                vld1q_s32(src_ptr + x),
                                vld1q_s32(src_ptr + x + 4),
                                vld1q_s32(src_ptr + x + 8),
                                vld1q_s32(src_ptr + x + 12)
                            }
                        };

                        // Wrap: reinterpret lanes as u32 and truncate to the low 8 bits,
                        // mirroring the static_cast<uint8_t> in the scalar tail.
                        vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
                        vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
                    }

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
                    }
                },
                src, dst);
            }
            break;
        }
1352 default:
1353 ARM_COMPUTE_ERROR("dst data type not supported");
1354 }
1355 break;
1356 default:
1357 ARM_COMPUTE_ERROR("Not supported");
1358 }
1359}
1360
1361const char *CpuCastKernel::name() const
1362{
1363 return "CpuCastKernel.cpp";
1364}
1365} // namespace kernels
1366} // namespace cpu
1367} // namespace arm_compute