blob: 531873e49ec2bf83616bba4cca9d33a44d43031c [file] [log] [blame]
/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000024#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/NEON/NEFixedPoint.h"
Michalis Spyroue2588182018-12-13 18:31:18 +000031#include "arm_compute/core/NEON/NEMath.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
34
35#include <arm_neon.h>
36
37using namespace arm_compute;
38
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010039namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010040{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010041Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
42{
43 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
44 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
45 ARM_COMPUTE_UNUSED(policy);
46 ARM_COMPUTE_RETURN_ERROR_ON(input == output);
Usama Arif9e631c22019-05-14 17:10:40 +010047 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32);
Michalis Spyroue2588182018-12-13 18:31:18 +000048 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010049 ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
50
Usama Arif9e631c22019-05-14 17:10:40 +010051 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
52 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
53 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
Michalis Spyroue2588182018-12-13 18:31:18 +000054
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010055 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
Usama Arif9e631c22019-05-14 17:10:40 +010056 && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
57 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010058
59 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
60 "Only data_types supported [in] U16 -> [out] U8, U32");
61
62 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
63 "Only data_types supported [in] S16 -> [out] U8, S32");
64
Usama Arif9e631c22019-05-14 17:10:40 +010065 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::U8 && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
66 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010067
Usama Arif9e631c22019-05-14 17:10:40 +010068 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
69 "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
70
71 ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
72 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
Michele Di Giorgio3e570db2018-08-24 18:28:48 +010073
74 // Validate in case of configured output
75 if(output->total_size() > 0)
76 {
77 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
78 }
79
80 return Status{};
81}
82
83std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
84{
85 constexpr unsigned int num_elems_processed_per_iteration = 16;
86
87 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
88
89 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
90 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
91 bool window_changed = update_window_and_padding(win, input_access, output_access);
92 output_access.set_valid_region(win, output->valid_region());
93
94 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
95 return std::make_pair(err, win);
96}
97} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010098
Giorgio Arena04a8f8c2017-11-23 11:45:24 +000099NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100100 : _input(nullptr), _output(nullptr), _policy(), _shift(0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100101{
102}
103
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100104void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100105{
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100106 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
107
108 // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
109 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
Georgios Pinitase2229412017-07-12 12:30:40 +0100110
111 _input = input;
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100112 _output = output;
Georgios Pinitase2229412017-07-12 12:30:40 +0100113 _policy = policy;
114 _shift = shift;
115
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100116 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
Georgios Pinitase2229412017-07-12 12:30:40 +0100117
118 // Configure kernel window
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100119 auto win_config = validate_and_configure_window(input->info(), output->info());
120 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
121 ICPPKernel::configure(win_config.second);
122}
Georgios Pinitase2229412017-07-12 12:30:40 +0100123
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100124Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
125{
126 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
127 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
128
129 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100130}
131
Giorgio Arena04a8f8c2017-11-23 11:45:24 +0000132void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100133{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100134 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100135 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Georgios Pinitase2229412017-07-12 12:30:40 +0100136 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100137 ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100138 ARM_COMPUTE_ERROR_ON(_input == _output);
139
140 Iterator input(_input, window);
141 Iterator output(_output, window);
142
143 switch(_input->info()->data_type())
144 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000145 case DataType::QASYMM8:
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100146 case DataType::U8:
147 {
148 const int16x8_t b = vdupq_n_s16(_shift);
149
150 switch(_output->info()->data_type())
151 {
152 case DataType::S16:
153 {
154 /* Up-conversion U8 -> S16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100155 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100156 {
157 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
158
159 const int16x8x2_t texels =
160 {
161 {
162 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
163 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
164 }
165 };
166
167 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
168 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
169 },
170 input, output);
171 break;
172 }
173 case DataType::S32:
174 {
175 /* Up-conversion U8 -> S32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100176 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100177 {
178 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
179
180 const int16x8x2_t texels =
181 {
182 {
183 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
184 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
185 }
186 };
187
188 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
189 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
190 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
191 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
192 },
193 input, output);
194 break;
195 }
Usama Arif9e631c22019-05-14 17:10:40 +0100196 case DataType::F32:
197 {
198 /* Up-conversion U8 -> F32 */
199 execute_window_loop(window, [&](const Coordinates &)
200 {
201 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
202
203 const int16x8x2_t texels =
204 {
205 {
206 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
207 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
208 }
209 };
210 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
211 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
212 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
213 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
214 },
215 input, output);
216 break;
217 }
218#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
219 case DataType::F16:
220 {
221 /* Up-conversion U8 -> F16 */
222 execute_window_loop(window, [&](const Coordinates &)
223 {
224 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
225
226 const int16x8x2_t texels =
227 {
228 {
229 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
230 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
231 }
232 };
233 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcvtq_f16_s16(texels.val[0]));
234 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcvtq_f16_s16(texels.val[1]));
235 },
236 input, output);
237 break;
238 }
239#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
240
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100241 case DataType::U16:
242 {
243 /* Up-conversion U8 -> U16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100244 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100245 {
246 const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
247
248 const uint16x8x2_t texels =
249 {
250 {
251 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
252 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
253 }
254 };
255
256 vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
257 vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
258 },
259 input, output);
260 break;
261 }
262 default:
263 ARM_COMPUTE_ERROR("Output data type not supported");
264 }
265 break;
266 }
267 case DataType::S16:
268 {
269 switch(_output->info()->data_type())
270 {
271 case DataType::U8:
272 {
273 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
274
275 /* Down-conversion S16 -> U8 */
276 if(ConvertPolicy::SATURATE == _policy)
277 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100278 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100279 {
280 const int16x8x2_t texels =
281 {
282 {
283 vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
284 vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
285 }
286 };
287
288 vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
289 },
290 input, output);
291 }
292 else
293 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100294 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100295 {
296 const int16x8x2_t texels =
297 {
298 {
299 vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
300 vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
301 }
302 };
303
304 vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
305 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
306 },
307 input, output);
308 }
309 break;
310 }
311 case DataType::S32:
312 {
313 const int32x4_t b = vdupq_n_s32(_shift);
314
315 /* Up-conversion S16 -> S32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100316 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100317 {
318 const int16x8x2_t texels =
319 {
320 {
321 vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
322 vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
323 }
324 };
325
326 const int32x4x4_t texels_s32 =
327 {
328 {
329 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
330 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
331 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
332 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
333 }
334 };
335
336 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
337 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
338 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
339 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
340 },
341 input, output);
342 break;
343 }
344 default:
345 ARM_COMPUTE_ERROR("Output data type not supported");
346 }
347 break;
348 }
349 case DataType::U16:
350 {
351 switch(_output->info()->data_type())
352 {
353 case DataType::U8:
354 {
355 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
356
357 /* Down-conversion U16 -> U8 */
358 if(ConvertPolicy::SATURATE == _policy)
359 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100360 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100361 {
362 const uint16x8x2_t texels =
363 {
364 {
365 vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
366 vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
367 }
368 };
369
370 vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
371 },
372 input, output);
373 }
374 else
375 {
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100376 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100377 {
378 const uint16x8x2_t texels =
379 {
380 {
381 vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
382 vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
383 }
384 };
385
386 vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
387 },
388 input, output);
389 }
390 break;
391 }
392 case DataType::U32:
393 {
394 const int32x4_t b = vdupq_n_s32(_shift);
395
396 /* Up-conversion U16 -> U32 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100397 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100398 {
399 const uint16x8x2_t texels =
400 {
401 {
402 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
403 vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
404 }
405 };
406
407 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
408 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
409 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
410 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
411 },
412 input, output);
413 break;
414 }
415 default:
416 ARM_COMPUTE_ERROR("Output data type not supported");
417 }
418 break;
419 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100420#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
421 case DataType::F16:
422 switch(_output->info()->data_type())
423 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000424 case DataType::QASYMM8:
Usama Arif9e631c22019-05-14 17:10:40 +0100425 case DataType::U8:
Michalis Spyroue2588182018-12-13 18:31:18 +0000426 {
Usama Arif9e631c22019-05-14 17:10:40 +0100427 const float16x8_t scale = vdupq_n_f16(1 << _shift);
Michalis Spyroue2588182018-12-13 18:31:18 +0000428
Usama Arif9e631c22019-05-14 17:10:40 +0100429 /* Up-conversion F16 -> U8 */
Michalis Spyroue2588182018-12-13 18:31:18 +0000430 execute_window_loop(window, [&](const Coordinates & id)
431 {
432 const float16x8x2_t texels =
433 {
434 {
435 vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())), scale),
436 vmulq_f16(vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8), scale),
437 }
438 };
439
Usama Arif9e631c22019-05-14 17:10:40 +0100440 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
Michalis Spyroue2588182018-12-13 18:31:18 +0000441 },
442 input, output);
443 break;
444 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100445 case DataType::F32:
446 {
447 const float32x4_t scale = vdupq_n_f32(1 << _shift);
448
449 /* Up-conversion F16 -> F32 */
450 execute_window_loop(window, [&](const Coordinates & id)
451 {
452 const float16x8x2_t texels =
453 {
454 {
455 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
456 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
457 }
458 };
459
460 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
461 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
462 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
463 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
464 },
465 input, output);
466 break;
467 }
Usama Arif9e631c22019-05-14 17:10:40 +0100468 case DataType::S32:
469 {
470 const float32x4_t scale = vdupq_n_f32(1 << _shift);
471
472 /* Up-conversion F16 -> S32 */
473 execute_window_loop(window, [&](const Coordinates & id)
474 {
475 const float16x8x2_t texels =
476 {
477 {
478 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
479 vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
480 }
481 };
482
483 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
484 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
485 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
486 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
487 },
488 input, output);
489 break;
490 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100491 default:
492 ARM_COMPUTE_ERROR("Output data type not supported");
493 }
494 break;
Michalis Spyroue2588182018-12-13 18:31:18 +0000495#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100496 case DataType::F32:
497 switch(_output->info()->data_type())
498 {
Michalis Spyroue2588182018-12-13 18:31:18 +0000499#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100500 case DataType::F16:
501 {
502 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
503
504 /* Down-conversion F32 -> F16 */
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100505 execute_window_loop(window, [&](const Coordinates &)
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100506 {
507 const float32x4x4_t texels =
508 {
509 {
510 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
511 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
512 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
513 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
514 }
515 };
516
517 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
518 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
519 },
520 input, output);
521 break;
522 }
Michalis Spyroue2588182018-12-13 18:31:18 +0000523#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Usama Arif9e631c22019-05-14 17:10:40 +0100524 case DataType::S32:
525 {
526 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
527
528 /* Conversion F32 -> S32 */
529 execute_window_loop(window, [&](const Coordinates &)
530 {
531 const float32x4x4_t texels =
532 {
533 {
534 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
535 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
536 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
537 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
538 }
539 };
540
541 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(texels.val[0]));
542 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(texels.val[1]));
543 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(texels.val[2]));
544 vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(texels.val[3]));
545 },
546 input, output);
547 break;
548 }
549 case DataType::QASYMM8:
550 case DataType::U8:
551 {
552 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
553
554 /* Down-conversion F32 -> U8 */
555 execute_window_loop(window, [&](const Coordinates &)
556 {
557 const float32x4x4_t texels =
558 {
559 {
560 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
561 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
562 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
563 vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
564 }
565 };
566
567 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
568 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
569 },
570 input, output);
571 break;
572 }
573
574
575 default:
576 ARM_COMPUTE_ERROR("Output data type not supported");
577 }
578 break;
579
580 case DataType::S32:
581 switch(_output->info()->data_type())
582 {
583#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
584 case DataType::F16:
585 {
586 const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
587
588 /* Down-conversion S32 -> F16 */
589 execute_window_loop(window, [&](const Coordinates &)
590 {
591 const float32x4x4_t texels =
592 {
593 {
594 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()))), scale),
595 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4)), scale),
596 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8)), scale),
597 vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12)), scale)
598 }
599 };
600
601 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
602 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
603 },
604 input, output);
605 break;
606 }
607#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
608 case DataType::F32:
609 {
610 const int32x4_t scale = vdupq_n_s32(1.f / (1 << _shift));
611
612 /* Conversion S32 -> F32 */
613 execute_window_loop(window, [&](const Coordinates &)
614 {
615 const int32x4x4_t texels =
616 {
617 {
618 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), scale),
619 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), scale),
620 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), scale),
621 vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), scale),
622 }
623 };
624
625 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(texels.val[0]));
626 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(texels.val[1]));
627 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(texels.val[2]));
628 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(texels.val[3]));
629 },
630 input, output);
631 break;
632 }
633 case DataType::QASYMM8:
634 case DataType::U8:
635 {
636 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
637
638 /* Down-conversion S32 -> U8 */
639 if(ConvertPolicy::SATURATE == _policy)
640 {
641 execute_window_loop(window, [&](const Coordinates &)
642 {
643 const int32x4x4_t texels =
644 {
645 {
646 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
647 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
648 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
649 vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
650 }
651 };
652
653 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
654 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
655 },
656 input, output);
657 }
658 else
659 {
660 execute_window_loop(window, [&](const Coordinates &)
661 {
662 const int32x4x4_t texels =
663 {
664 {
665 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
666 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
667 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
668 vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
669 }
670 };
671
672 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
673 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
674 },
675 input, output);
676 }
677 break;
678 }
Michele Di Giorgio3e570db2018-08-24 18:28:48 +0100679 default:
680 ARM_COMPUTE_ERROR("Output data type not supported");
681 }
682 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100683 default:
684 ARM_COMPUTE_ERROR("Not supported");
685 }
686}