blob: 15a9ddcab4082853911f24d01cd25a5cb7efc378 [file] [log] [blame]
Georgios Pinitas11d84152021-04-28 10:20:18 +01001/*
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +02002 * Copyright (c) 2016-2022 Arm Limited.
Georgios Pinitas11d84152021-04-28 10:20:18 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuCastKernel.h"
Georgios Pinitas11d84152021-04-28 10:20:18 +010025
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Validate.h"
31#include "src/core/CPP/Validate.h"
32#include "src/core/NEON/NEFixedPoint.h"
33#include "src/core/NEON/NEMath.h"
34#include "src/core/NEON/wrapper/wrapper.h"
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +020035#include "src/core/common/Registrars.h"
Georgios Pinitas11d84152021-04-28 10:20:18 +010036#include "src/core/helpers/AutoConfiguration.h"
37#include "src/core/helpers/WindowHelpers.h"
38#include "support/SaturateCast.h"
39
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +020040#include "src/cpu/kernels/cast/list.h"
41
Georgios Pinitas11d84152021-04-28 10:20:18 +010042namespace arm_compute
43{
44namespace cpu
45{
46namespace kernels
47{
48namespace
49{
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +020050static const std::vector<CpuCastKernel::CastKernel> available_kernels =
51{
52 {
53 "neon_qs8_cast",
54 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
55 REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
56 },
57 {
58 "neon_qu8_cast",
59 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
60 REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
61 },
62 {
63 "neon_u8_cast",
64 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
65 REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
66 },
67 {
68 "neon_fp16_cast",
69 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
70 REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
71 },
72 {
73 "neon_fp32_to_fp16_cast",
74 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
75 REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
76 },
77 {
78 "neon_fp32_to_bf16_cast",
79 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::BFLOAT16 && data.isa.bf16; },
80 REGISTER_BF16_NEON(arm_compute::cpu::neon_fp32_to_bfloat16_cast)
81 },
82 {
83 "neon_s32_cast",
84 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
85 REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
86 },
87 {
88 "neon_bf16_cast",
89 [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::BFLOAT16 && data.dst_dt == DataType::F32 && data.isa.bf16; },
90 REGISTER_BF16_NEON(arm_compute::cpu::neon_bfloat16_to_fp32_cast)
91 },
92};
93
Georgios Pinitas11d84152021-04-28 10:20:18 +010094Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
95{
96 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
97 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
98 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
99 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
100 ARM_COMPUTE_UNUSED(policy);
101 ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
102 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
103 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
104 DataType::F32, DataType::S32);
105 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
106 DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
107 DataType::U32, DataType::S32, DataType::F32);
108
109 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
110 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
111 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
112
113 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
114 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
115 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
116
117 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
118 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
119 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
120
121 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
122 "Only data_types supported [in] U16 -> [out] U8, U32");
123
124 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
125 "Only data_types supported [in] S16 -> [out] U8, S32");
126
127 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
128 "Only data_types supported [in] BFLOAT16 -> [out] F32");
129
130 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
131 && dst->data_type() != DataType::U8
132 && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
133 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
134
135 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
136 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
137 && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
138 "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
139
140 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
141 && dst->data_type() != DataType::F16
142 && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
143 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
144
145 // Validate in case of configured dst
146 if(dst->total_size() > 0)
147 {
148 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
149 }
150
151 return Status{};
152}
153} // namespace
154
155void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
156{
157 ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
158
159 // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
160 set_shape_if_empty(*dst, src->tensor_shape());
161
162 _policy = policy;
163
164 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
165
166 // Configure kernel window
167 Window win = calculate_max_window(*src, Steps());
168
169 ICPPKernel::configure(win);
170}
171
172Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
173{
174 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
175 return Status{};
176}
177
178void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
179{
180 ARM_COMPUTE_UNUSED(info);
181 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
182 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
183
184 const auto window_start_x = static_cast<int>(window.x().start());
185 const auto window_end_x = static_cast<int>(window.x().end());
186 const int window_step_x = 16;
187
188 const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
189 ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
190 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
191 ARM_COMPUTE_ERROR_ON(_src == _dst);
192
193 ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
194
195 Window win{ window };
196 win.set(Window::DimX, Window::Dimension(0, 1, 1));
197
198 Iterator src(_src, win);
199 Iterator dst(_dst, win);
200
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200201 /*ukernel runs only when using fp16/bfloat16, so we validate it isn't a nullptr only before using it */
202 const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
203
Georgios Pinitas11d84152021-04-28 10:20:18 +0100204 switch(_src->info()->data_type())
205 {
206 case DataType::QASYMM8_SIGNED:
207 {
208 switch(_dst->info()->data_type())
209 {
210 case DataType::S16:
211 {
212 /* Up-conversion QASYMM8_SIGNED -> S16 */
213 execute_window_loop(win, [&](const Coordinates &)
214 {
215 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
216 const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
217 int x = window_start_x;
218
219 for(; x <= (window_end_x - window_step_x); x += window_step_x)
220 {
221 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
222
223 const int16x8x2_t texels =
224 {
225 {
226 vmovl_s8(vget_low_s8(texels_s8)),
227 vmovl_s8(vget_high_s8(texels_s8))
228 }
229 };
230
231 vst1q_s16(dst_ptr + x, texels.val[0]);
232 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
233 }
234
235 // Compute left-over elements
236 for(; x < window_end_x; ++x)
237 {
238 *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
239 }
240 },
241 src, dst);
242 break;
243 }
244 case DataType::S32:
245 {
246 /* Up-conversion QASYMM8_SIGNED -> S32 */
247 execute_window_loop(win, [&](const Coordinates &)
248 {
249 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
250 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
251 int x = window_start_x;
252
253 for(; x <= (window_end_x - window_step_x); x += window_step_x)
254 {
255 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
256
257 const int16x8x2_t texels =
258 {
259 {
260 vmovl_s8(vget_low_s8(texels_s8)),
261 vmovl_s8(vget_high_s8(texels_s8))
262 }
263 };
264
265 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
266 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
267 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
268 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
269 }
270
271 // Compute left-over elements
272 for(; x < window_end_x; ++x)
273 {
274 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
275 }
276 },
277 src, dst);
278 break;
279 }
280 case DataType::F32:
281 {
282 /* Up-conversion QASYMM8_SIGNED -> F32 */
283 execute_window_loop(win, [&](const Coordinates &)
284 {
285 const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
286 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
287
288 int x = window_start_x;
289 for(; x <= (window_end_x - window_step_x); x += window_step_x)
290 {
Viet-Hoa Do622b8ad2022-09-13 10:20:06 +0100291 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100292
293 const int16x8x2_t texels =
294 {
295 {
296 vmovl_s8(vget_low_s8(texels_s8)),
297 vmovl_s8(vget_high_s8(texels_s8))
298 }
299 };
300 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
301 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
302 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
303 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
304 }
305
306 // Compute left-over elements
307 for(; x < window_end_x; ++x)
308 {
309 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
310 }
311 },
312 src, dst);
313 break;
314 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100315 case DataType::F16:
316 {
317 /* Up-conversion QASYMM8_SIGNED -> F16 */
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200318 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
319 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100320 break;
321 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100322 default:
323 ARM_COMPUTE_ERROR("dst data type not supported");
324 }
325 break;
326 }
327
328 case DataType::QASYMM8:
329 case DataType::U8:
330 {
331 switch(_dst->info()->data_type())
332 {
333 case DataType::S16:
334 {
335 /* Up-conversion U8 -> S16 */
336 execute_window_loop(win, [&](const Coordinates &)
337 {
338 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
339 const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
340
341 int x = window_start_x;
342 for(; x <= (window_end_x - window_step_x); x += window_step_x)
343 {
344 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
345
346 const int16x8x2_t texels =
347 {
348 {
349 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
350 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
351 }
352 };
353
354 vst1q_s16(dst_ptr + x, texels.val[0]);
355 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
356 }
357
358 // Compute left-over elements
359 for(; x < window_end_x; ++x)
360 {
361 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
362 }
363 },
364 src, dst);
365 break;
366 }
367 case DataType::S32:
368 {
369 /* Up-conversion U8 -> S32 */
370 execute_window_loop(win, [&](const Coordinates &)
371 {
372 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
373 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
374
375 int x = window_start_x;
376 for(; x <= (window_end_x - window_step_x); x += window_step_x)
377 {
378 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
379
380 const int16x8x2_t texels =
381 {
382 {
383 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
384 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
385 }
386 };
387
388 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
389 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
390 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
391 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
392 }
393
394 // Compute left-over elements
395 for(; x < window_end_x; ++x)
396 {
397 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
398 }
399 },
400 src, dst);
401 break;
402 }
403 case DataType::F32:
404 {
405 /* Up-conversion U8 -> F32 */
406 execute_window_loop(win, [&](const Coordinates &)
407 {
408 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
409 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
410
411 int x = window_start_x;
412 for(; x <= (window_end_x - window_step_x); x += window_step_x)
413 {
414 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
415
416 const int16x8x2_t texels =
417 {
418 {
419 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
420 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
421 }
422 };
423 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
424 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
425 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
426 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
427 }
428
429 // Compute left-over elements
430 for(; x < window_end_x; ++x)
431 {
432 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
433 }
434 },
435 src, dst);
436 break;
437 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100438 case DataType::F16:
439 {
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200440 /* Up-conversion U8 -> FP16 */
441 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
442 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100443 break;
444 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100445 case DataType::U16:
446 {
447 /* Up-conversion U8 -> U16 */
448 execute_window_loop(win, [&](const Coordinates &)
449 {
450 const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
451 const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
452
453 int x = window_start_x;
454 for(; x <= (window_end_x - window_step_x); x += window_step_x)
455 {
456 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
457
458 const uint16x8x2_t texels =
459 {
460 {
461 vmovl_u8(vget_low_u8(texels_u8)),
462 vmovl_u8(vget_high_u8(texels_u8))
463 }
464 };
465
466 vst1q_u16(dst_ptr + x, texels.val[0]);
467 vst1q_u16(dst_ptr + x + 8, texels.val[1]);
468 }
469
470 // Compute left-over elements
471 for(; x < window_end_x; ++x)
472 {
473 *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
474 }
475 },
476 src, dst);
477 break;
478 }
479 default:
480 ARM_COMPUTE_ERROR("dst data type not supported");
481 }
482 break;
483 }
484 case DataType::S16:
485 {
486 switch(_dst->info()->data_type())
487 {
488 case DataType::QASYMM8_SIGNED:
489 {
490 /* Down-conversion S16 -> QASYMM8_SIGNED */
491 if(ConvertPolicy::SATURATE == _policy)
492 {
493 execute_window_loop(win, [&](const Coordinates &)
494 {
495 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
496 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
497
498 int x = window_start_x;
499 for(; x <= (window_end_x - window_step_x); x += window_step_x)
500 {
501 const int16x8x2_t texels =
502 {
503 {
504 vld1q_s16(src_ptr + x),
505 vld1q_s16(src_ptr + x + 8)
506 }
507 };
508
509 vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
510 }
511
512 // Compute left-over elements
513 for(; x < window_end_x; ++x)
514 {
515 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
516 }
517 },
518 src, dst);
519 }
520 else
521 {
522 execute_window_loop(win, [&](const Coordinates &)
523 {
524 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
525 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
526
527 int x = window_start_x;
528 for(; x <= (window_end_x - window_step_x); x += window_step_x)
529 {
530 const int16x8x2_t texels =
531 {
532 {
533 vld1q_s16(src_ptr + x),
534 vld1q_s16(src_ptr + x + 8)
535 }
536 };
537
538 vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
539 }
540
541 // Compute left-over elements
542 for(; x < window_end_x; ++x)
543 {
544 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
545 }
546 },
547 src, dst);
548 }
549 break;
550 }
551 case DataType::U8:
552 {
553 /* Down-conversion S16 -> U8 */
554 if(ConvertPolicy::SATURATE == _policy)
555 {
556 execute_window_loop(win, [&](const Coordinates &)
557 {
558 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
559 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
560
561 int x = window_start_x;
562 for(; x <= (window_end_x - window_step_x); x += window_step_x)
563 {
564 const int16x8x2_t texels =
565 {
566 {
567 vld1q_s16(src_ptr + x),
568 vld1q_s16(src_ptr + x + 8)
569 }
570 };
571
572 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
573 }
574
575 // Compute left-over elements
576 for(; x < window_end_x; ++x)
577 {
578 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
579 }
580 },
581 src, dst);
582 }
583 else
584 {
585 execute_window_loop(win, [&](const Coordinates &)
586 {
587 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
588 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
589
590 int x = window_start_x;
591 for(; x <= (window_end_x - window_step_x); x += window_step_x)
592 {
593 const int16x8x2_t texels =
594 {
595 {
596 vld1q_s16(src_ptr + x),
597 vld1q_s16(src_ptr + x + 8)
598 }
599 };
600
601 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
602 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
603 }
604
605 // Compute left-over elements
606 for(; x < window_end_x; ++x)
607 {
608 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
609 }
610 },
611 src, dst);
612 }
613 break;
614 }
615 case DataType::S32:
616 {
617 /* Up-conversion S16 -> S32 */
618 execute_window_loop(win, [&](const Coordinates &)
619 {
620 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
621 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
622
623 int x = window_start_x;
624 for(; x <= (window_end_x - window_step_x); x += window_step_x)
625 {
626 const int16x8x2_t texels =
627 {
628 {
629 vld1q_s16(src_ptr + x),
630 vld1q_s16(src_ptr + x + 8)
631 }
632 };
633
634 const int32x4x4_t texels_s32 =
635 {
636 {
637 vmovl_s16(vget_low_s16(texels.val[0])),
638 vmovl_s16(vget_high_s16(texels.val[0])),
639 vmovl_s16(vget_low_s16(texels.val[1])),
640 vmovl_s16(vget_high_s16(texels.val[1]))
641 }
642 };
643
644 vst1q_s32(dst_ptr + x, texels_s32.val[0]);
645 vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
646 vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
647 vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
648 }
649
650 // Compute left-over elements
651 for(; x < window_end_x; ++x)
652 {
653 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
654 }
655 },
656 src, dst);
657 break;
658 }
659 default:
660 ARM_COMPUTE_ERROR("dst data type not supported");
661 }
662 break;
663 }
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200664
Georgios Pinitas11d84152021-04-28 10:20:18 +0100665 case DataType::U16:
666 {
667 switch(_dst->info()->data_type())
668 {
669 case DataType::U8:
670 {
671 /* Down-conversion U16 -> U8 */
672 if(ConvertPolicy::SATURATE == _policy)
673 {
674 execute_window_loop(win, [&](const Coordinates &)
675 {
676 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
677 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
678
679 int x = window_start_x;
680 for(; x <= (window_end_x - window_step_x); x += window_step_x)
681 {
682 const uint16x8x2_t texels =
683 {
684 {
685 vld1q_u16(src_ptr + x),
686 vld1q_u16(src_ptr + x + 8)
687 }
688 };
689
690 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
691 }
692
693 // Compute left-over elements
694 for(; x < window_end_x; ++x)
695 {
696 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
697 }
698 },
699 src, dst);
700 }
701 else
702 {
703 execute_window_loop(win, [&](const Coordinates &)
704 {
705 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
706 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
707
708 int x = window_start_x;
709 for(; x <= (window_end_x - window_step_x); x += window_step_x)
710 {
711 const uint16x8x2_t texels =
712 {
713 {
714 vld1q_u16(src_ptr + x),
715 vld1q_u16(src_ptr + x + 8)
716 }
717 };
718
719 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
720 }
721
722 // Compute left-over elements
723 for(; x < window_end_x; ++x)
724 {
725 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
726 }
727
728 },
729 src, dst);
730 }
731 break;
732 }
733 case DataType::U32:
734 {
735 /* Up-conversion U16 -> U32 */
736 execute_window_loop(win, [&](const Coordinates &)
737 {
738 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
739 const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
740
741 int x = window_start_x;
742 for(; x <= (window_end_x - window_step_x); x += window_step_x)
743 {
744 const uint16x8x2_t texels =
745 {
746 {
747 vld1q_u16(src_ptr + x),
748 vld1q_u16(src_ptr + x + 8)
749 }
750 };
751
752 vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
753 vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
754 vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
755 vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
756 }
757 // Compute left-over elements
758 for(; x < window_end_x; ++x)
759 {
760 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
761 }
762
763 },
764 src, dst);
765 break;
766 }
767 default:
768 ARM_COMPUTE_ERROR("dst data type not supported");
769 }
770 break;
771 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100772 case DataType::BFLOAT16:
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200773 {
774 /* Up-conversion BFLOAT16 -> F32 */
775 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
776 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100777 break;
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200778 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100779 case DataType::F16:
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200780 {
781 /* conversion F16 -> any data type */
782 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
783 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100784 break;
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200785 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100786 case DataType::F32:
787 switch(_dst->info()->data_type())
788 {
Georgios Pinitas11d84152021-04-28 10:20:18 +0100789 case DataType::F16:
790 {
791 /* Down-conversion F32 -> F16 */
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200792 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
793 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100794 break;
795 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100796 case DataType::BFLOAT16:
797 {
798 /* Down-conversion F32 -> BFLOAT16 */
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200799 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
800 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100801 break;
802 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100803 case DataType::S32:
804 {
805 /* Conversion F32 -> S32 */
806 execute_window_loop(win, [&](const Coordinates &)
807 {
808 const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
809 const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
810
811 int x = window_start_x;
812 for(; x <= (window_end_x - window_step_x); x += window_step_x)
813 {
814 const float32x4x4_t texels =
815 {
816 {
817 vld1q_f32(src_ptr + x),
818 vld1q_f32(src_ptr + x + 4),
819 vld1q_f32(src_ptr + x + 8),
820 vld1q_f32(src_ptr + x + 12),
821 }
822 };
823
824 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
825 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
826 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
827 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
828 }
829
830 // Compute left-over elements
831 for(; x < window_end_x; ++x)
832 {
833 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
834 }
835 },
836 src, dst);
837 break;
838 }
839 case DataType::QASYMM8:
840 case DataType::U8:
841 {
842 /* Down-conversion F32 -> U8 */
843 execute_window_loop(win, [&](const Coordinates &)
844 {
845 const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
846 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
847
848 int x = window_start_x;
849 for(; x <= (window_end_x - window_step_x); x += window_step_x)
850 {
851 const float32x4x4_t texels =
852 {
853 {
854 vld1q_f32(src_ptr + x),
855 vld1q_f32(src_ptr + x + 4),
856 vld1q_f32(src_ptr + x + 8),
857 vld1q_f32(src_ptr + x + 12),
858 }
859 };
860
861 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
862 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
863 }
864
865 // Compute left-over elements
866 for(; x < window_end_x; ++x)
867 {
868 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
869 }
870 },
871 src, dst);
872 break;
873 }
874 case DataType::QASYMM8_SIGNED:
875 {
876 /* Down-conversion F32 -> QASYMM8_SIGNED */
877 execute_window_loop(win, [&](const Coordinates &)
878 {
879 const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
880 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
881
882 int x = window_start_x;
883 for(; x <= (window_end_x - window_step_x); x += window_step_x)
884 {
885 const float32x4x4_t texels =
886 {
887 {
888 vld1q_f32(src_ptr + x),
889 vld1q_f32(src_ptr + x + 4),
890 vld1q_f32(src_ptr + x + 8),
891 vld1q_f32(src_ptr + x + 12),
892 }
893 };
894
895 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
896 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
897 }
898 // Compute left-over elements
899 for(; x < window_end_x; ++x)
900 {
901 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
902 }
903 },
904 src, dst);
905 break;
906 }
907
908 default:
909 ARM_COMPUTE_ERROR("dst data type not supported");
910 }
911 break;
912
913 case DataType::S32:
914 switch(_dst->info()->data_type())
915 {
Georgios Pinitas11d84152021-04-28 10:20:18 +0100916 case DataType::F16:
917 {
918 /* Down-conversion S32 -> F16 */
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +0200919 ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
920 uk->ukernel(_src, _dst, info, _policy, window);
Georgios Pinitas11d84152021-04-28 10:20:18 +0100921 break;
922 }
Georgios Pinitas11d84152021-04-28 10:20:18 +0100923 case DataType::F32:
924 {
925 /* Conversion S32 -> F32 */
926 execute_window_loop(win, [&](const Coordinates &)
927 {
928 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
929 const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
930
931 int x = window_start_x;
932 for(; x <= (window_end_x - window_step_x); x += window_step_x)
933 {
934 const int32x4x4_t texels =
935 {
936 {
937 vld1q_s32(src_ptr + x),
938 vld1q_s32(src_ptr + x + 4),
939 vld1q_s32(src_ptr + x + 8),
940 vld1q_s32(src_ptr + x + 12),
941 }
942 };
943
944 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
945 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
946 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
947 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
948 }
949
950 // Compute left-over elements
951 for(; x < window_end_x; ++x)
952 {
953 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
954 }
955 },
956 src, dst);
957 break;
958 }
959 case DataType::QASYMM8_SIGNED:
960 {
961 /* Down-conversion S32 -> QASYMM8_SIGNED */
962 if(ConvertPolicy::SATURATE == _policy)
963 {
964 execute_window_loop(win, [&](const Coordinates &)
965 {
966 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
967 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
968
969 int x = window_start_x;
970 for(; x <= (window_end_x - window_step_x); x += window_step_x)
971 {
972 const int32x4x4_t texels =
973 {
974 {
975 vld1q_s32(src_ptr + x),
976 vld1q_s32(src_ptr + x + 4),
977 vld1q_s32(src_ptr + x + 8),
978 vld1q_s32(src_ptr + x + 12),
979 }
980 };
981 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
982 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
983 }
984
985 // Compute left-over elements
986 for(; x < window_end_x; ++x)
987 {
988 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
989 }
990 },
991 src, dst);
992 }
993 else
994 {
995 execute_window_loop(win, [&](const Coordinates &)
996 {
997 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
998 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
999
1000 int x = window_start_x;
1001 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1002 {
1003 const int32x4x4_t texels =
1004 {
1005 {
1006 vld1q_s32(src_ptr + x),
1007 vld1q_s32(src_ptr + x + 4),
1008 vld1q_s32(src_ptr + x + 8),
1009 vld1q_s32(src_ptr + x + 12)
1010 }
1011 };
1012
1013 vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1014 vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1015 }
1016
1017 // Compute left-over elements
1018 for(; x < window_end_x; ++x)
1019 {
1020 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1021 }
1022 },
1023 src, dst);
1024 }
1025 break;
1026 }
1027 case DataType::QASYMM8:
1028 case DataType::U8:
1029 {
1030 /* Down-conversion S32 -> U8 */
1031 if(ConvertPolicy::SATURATE == _policy)
1032 {
1033 execute_window_loop(win, [&](const Coordinates &)
1034 {
1035 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1036 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1037
1038 int x = window_start_x;
1039 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1040 {
1041 const int32x4x4_t texels =
1042 {
1043 {
1044 vld1q_s32(src_ptr + x),
1045 vld1q_s32(src_ptr + x + 4),
1046 vld1q_s32(src_ptr + x + 8),
1047 vld1q_s32(src_ptr + x + 12)
1048 }
1049 };
1050 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1051 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1052 }
1053
1054 // Compute left-over elements
1055 for(; x < window_end_x; ++x)
1056 {
1057 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1058 }
1059 },
1060 src, dst);
1061 }
1062 else
1063 {
1064 execute_window_loop(win, [&](const Coordinates &)
1065 {
1066 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1067 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1068
1069 int x = window_start_x;
1070 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1071 {
1072 const int32x4x4_t texels =
1073 {
1074 {
1075 vld1q_s32(src_ptr + x),
1076 vld1q_s32(src_ptr + x + 4),
1077 vld1q_s32(src_ptr + x + 8),
1078 vld1q_s32(src_ptr + x + 12)
1079 }
1080 };
1081
1082 vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1083 vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1084 }
1085
1086 // Compute left-over elements
1087 for(; x < window_end_x; ++x)
1088 {
1089 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
1090 }
1091 },
1092 src, dst);
1093 }
1094 break;
1095 }
1096 default:
1097 ARM_COMPUTE_ERROR("dst data type not supported");
1098 }
1099 break;
1100 default:
1101 ARM_COMPUTE_ERROR("Not supported");
1102 }
1103}
1104
1105const char *CpuCastKernel::name() const
1106{
1107 return "CpuCastKernel.cpp";
1108}
Yair Schwarzbaum298b2c02022-02-01 08:55:56 +02001109
1110const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
1111{
1112 return available_kernels;
1113}
1114
Georgios Pinitas11d84152021-04-28 10:20:18 +01001115} // namespace kernels
1116} // namespace cpu
1117} // namespace arm_compute