blob: 98f7e8b9493a2be0cf668fb2be1ecb3e755105c8 [file] [log] [blame]
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001/*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
25#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H
26
27#include "src/core/NEON/NEAsymm.h"
28
29namespace arm_compute
30{
31namespace cpu
32{
33template <ArithmeticOperation op, typename VectorType>
34typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
35{
36 using vec_type = typename VectorType::type;
37 using scalar_type = typename VectorType::scalar_type;
38 using tag_type = typename VectorType::tag_type;
39
40 vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
41
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010042 switch (op)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +020043 {
44 case ArithmeticOperation::MAX:
45 res = wrapper::vmax(a, b);
46 break;
47 case ArithmeticOperation::MIN:
48 res = wrapper::vmin(a, b);
49 break;
50 case ArithmeticOperation::SQUARED_DIFF:
51 {
52 const vec_type tmp = wrapper::vsub(a, b);
53 res = wrapper::vmul(tmp, tmp);
54 break;
55 }
56 case ArithmeticOperation::PRELU:
57 {
58 const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
59 const vec_type tmp = wrapper::vmul(a, b);
60 const auto gt = wrapper::vcgt(a, zero);
61
62 res = wrapper::vbsl(gt, a, tmp);
63 break;
64 }
65
66 default:
67 ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
68 }
69
70 return res;
71}
Dana Zlotnika538ae52022-02-21 13:12:41 +020072
Dana Zlotnikd5c496d2021-11-28 14:46:12 +020073template <ArithmeticOperation op, typename ScalarType, typename VectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010074typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,
75 const ScalarType &broadcast_value,
76 const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +020077{
78 using tag_type = typename VectorType::tag_type;
79 using vec_type = typename VectorType::type;
80
81 vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
82 return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
83}
84
85template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010086void elementwise_op(
87 const ITensor *in1,
88 const ITensor *in2,
89 ITensor *out,
90 const Window &window,
91 OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
92 int (*broadcast_func)(
93 int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
94 int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
Dana Zlotnikd5c496d2021-11-28 14:46:12 +020095{
96 // Create input windows
97 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
98 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
99
100 // Clear X Dimension on execution window as we handle manually
101 Window win = window;
102 win.set(Window::DimX, Window::Dimension(0, 1, 1));
103
104 const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
105 const auto window_start_x = static_cast<int>(window.x().start());
106 const auto window_end_x = static_cast<int>(window.x().end());
107 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
108
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100109 if (is_broadcast_across_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200110 {
111 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
112 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
113 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
114 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
115 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
116
117 // Clear X Dimension on execution window as we handle manually
118 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
119
120 Iterator broadcast_input(broadcast_tensor, broadcast_win);
121 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
122 Iterator output(out, win);
123
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100124 execute_window_loop(
125 win,
126 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200127 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100128 auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
129 const auto non_broadcast_input_ptr =
130 reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
131 const InputScalarType broadcast_value =
132 *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
133
134 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
135 broadcast_value, output_ptr, !is_broadcast_input_2);
136 for (; x < window_end_x; ++x)
137 {
138 const auto a = *(non_broadcast_input_ptr + x);
139 *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
140 !is_broadcast_input_2 ? a : broadcast_value);
141 }
142 },
143 broadcast_input, non_broadcast_input, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200144 }
145 else
146 {
147 // Clear X Dimension on execution window as we handle manually
148 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
149 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
150
151 Iterator input1(in1, input1_win);
152 Iterator input2(in2, input2_win);
153 Iterator output(out, win);
154
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100155 execute_window_loop(
156 win,
157 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200158 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100159 auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
160 const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
161 const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
162
163 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
164 for (; x < window_end_x; ++x)
165 {
166 const auto a = *(input1_ptr + x);
167 const auto b = *(input2_ptr + x);
168 *(output_ptr + x) = (*scalar_func)(a, b);
169 }
170 },
171 input1, input2, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200172 }
173}
174
175template <ArithmeticOperation op, typename ScalarType>
176inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
177{
178 auto res = ScalarType(0);
179
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100180 switch (op)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200181 {
182 case ArithmeticOperation::MAX:
183 res = std::max(a, b);
184 break;
185 case ArithmeticOperation::MIN:
186 res = std::min(a, b);
187 break;
188 case ArithmeticOperation::SQUARED_DIFF:
189 {
190 res = (a - b) * (a - b);
191 break;
192 }
193 case ArithmeticOperation::PRELU:
194 {
195 res = (a > 0 ? a : a * b);
196 break;
197 }
198 case ArithmeticOperation::DIV:
199 {
200 res = a / b;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100201 if (std::is_integral<ScalarType>::value)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200202 {
203 res = (b == 0) ? 0 : res;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100204 if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200205 {
206 --res;
207 }
208 }
209 break;
210 }
211 case ArithmeticOperation::POWER:
212 {
213 res = std::pow(a, b);
214 break;
215 }
216 default:
217 ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
218 }
219 return res;
220}
221
222template <>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100223inline int32x4_t
224elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
225 const int32x4_t &b)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200226{
227 return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
228}
229
230template <>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100231inline float32x4_t
232elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
233 const float32x4_t &b)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200234{
235 return wrapper::vdiv(a, b);
236}
237
238template <>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100239inline float32x4_t
240elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
241 const float32x4_t &b)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200242{
243 return wrapper::vpow(a, b);
244}
245
246#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
247template <>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100248inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
249 const float16x8_t &a, const float16x8_t &b)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200250{
251 return wrapper::vdiv(a, b);
252}
253
254template <>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100255inline float16x8_t
256elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
257 const float16x8_t &a, const float16x8_t &b)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200258{
259 return wrapper::vpow(a, b);
260}
261#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
262
263template <ArithmeticOperation op, typename ScalarType, typename VectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100264inline int elementwise_arithm_op_loop(int window_start_x,
265 int window_end_x,
266 int window_step_x,
267 const ScalarType *input1_ptr,
268 const ScalarType *input2_ptr,
269 ScalarType *output_ptr)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200270{
271 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100272 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200273 {
274 const auto a = wrapper::vloadq(input1_ptr + x);
275 const auto b = wrapper::vloadq(input2_ptr + x);
276 wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
277 }
278 return x;
279}
280
281template <ArithmeticOperation op, typename ScalarType, typename VectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100282inline int elementwise_arithm_op_broadcast_loop(int window_start_x,
283 int window_end_x,
284 int window_step_x,
285 const ScalarType *non_broadcast_input_ptr,
286 const ScalarType &broadcast_value,
287 ScalarType *output_ptr,
288 const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200289{
290 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100291 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200292 {
293 const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100294 wrapper::vstore(output_ptr + x,
295 elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200296 }
297 return x;
298}
299
300template <ArithmeticOperation op, typename VectorType>
301void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
302{
303 using scalar_type = typename VectorType::scalar_type;
304
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100305 elementwise_op<scalar_type, scalar_type, VectorType>(
306 in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,
307 &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
308 &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200309}
310
311template <ComparisonOperation op, typename InputScalarType>
312inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
313{
314 bool res = false;
315
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100316 switch (op)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200317 {
318 case ComparisonOperation::Equal:
319 res = (a == b);
320 break;
321 case ComparisonOperation::NotEqual:
322 res = (a != b);
323 break;
324 case ComparisonOperation::Greater:
325 res = (a > b);
326 break;
327 case ComparisonOperation::GreaterEqual:
328 res = (a >= b);
329 break;
330 case ComparisonOperation::Less:
331 res = (a < b);
332 break;
333 case ComparisonOperation::LessEqual:
334 res = (a <= b);
335 break;
336 default:
337 ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
338 }
339 return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
340}
341
342template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
343inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
344{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100345 OutputVectorType res = {0, 0, 0, 0};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200346
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100347 switch (op)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200348 {
349 case ComparisonOperation::Equal:
350 res = wrapper::vceq(a, b);
351 break;
352 case ComparisonOperation::NotEqual:
353 res = wrapper::vnot(wrapper::vceq(a, b));
354 break;
355 case ComparisonOperation::Greater:
356 res = wrapper::vcgt(a, b);
357 break;
358 case ComparisonOperation::GreaterEqual:
359 res = wrapper::vcge(a, b);
360 break;
361 case ComparisonOperation::Less:
362 res = wrapper::vcgt(b, a);
363 break;
364 case ComparisonOperation::LessEqual:
365 res = wrapper::vcge(b, a);
366 break;
367 default:
368 ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
369 }
370
371 return res;
372}
373
374template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100375inline OutputVectorType
376elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200377{
378 InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100379 return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,
380 reorder ? a : broadcast_vector);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200381}
382
383template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100384inline int elementwise_comp_op_broadcast_8_loop(int window_start_x,
385 int window_end_x,
386 int window_step_x,
387 const InputScalarType *non_broadcast_input_ptr,
388 const InputScalarType &broadcast_value,
389 uint8_t *output_ptr,
390 const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200391{
392 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100393 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200394 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100395 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
396 wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200397 wrapper::vstore(output_ptr + x, a);
398 }
399 return x;
400}
401
402template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100403inline int elementwise_comp_op_broadcast_16_loop(int window_start_x,
404 int window_end_x,
405 int window_step_x,
406 const InputScalarType *non_broadcast_input_ptr,
407 const InputScalarType &broadcast_value,
408 uint8_t *output_ptr,
409 const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200410{
411 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100412 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200413 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100414 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
415 wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200416 wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
417 }
418 return x;
419}
420
421template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100422inline int elementwise_comp_op_broadcast_32_loop(int window_start_x,
423 int window_end_x,
424 int window_step_x,
425 const InputScalarType *non_broadcast_input_ptr,
426 const InputScalarType &broadcast_value,
427 uint8_t *output_ptr,
428 const bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200429{
430 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100431 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200432 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100433 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
434 wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
435 const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
436 wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200437 wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
438 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100439 if (x <= window_end_x - 4)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200440 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100441 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
442 wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
443 for (int i = 0; i < 4; i++)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200444 {
445 *(output_ptr + x + i) = wrapper::vgetlane(a, i);
446 }
447 x = +4;
448 }
449 return x;
450}
451
452template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100453inline int elementwise_comp_op_8_loop(int window_start_x,
454 int window_end_x,
455 int window_step_x,
456 const InputScalarType *input1_ptr,
457 const InputScalarType *input2_ptr,
458 uint8_t *output_ptr)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200459{
460 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100461 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200462 {
463 const auto a = wrapper::vloadq(input1_ptr + x);
464 const auto b = wrapper::vloadq(input2_ptr + x);
465 const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
466 wrapper::vstore(output_ptr + x, res);
467 }
468 return x;
469}
470
471template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100472inline int elementwise_comp_op_16_loop(int window_start_x,
473 int window_end_x,
474 int window_step_x,
475 const InputScalarType *input1_ptr,
476 const InputScalarType *input2_ptr,
477 uint8_t *output_ptr)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200478{
479 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100480 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200481 {
482 const auto a = wrapper::vloadq(input1_ptr + x);
483 const auto b = wrapper::vloadq(input2_ptr + x);
484 const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
485 wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
486 }
487 return x;
488}
489
490template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100491inline int elementwise_comp_op_32_loop(int window_start_x,
492 int window_end_x,
493 int window_step_x,
494 const InputScalarType *input1_ptr,
495 const InputScalarType *input2_ptr,
496 uint8_t *output_ptr)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200497{
498 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100499 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200500 {
501 auto a = wrapper::vloadq(input1_ptr + x);
502 auto b = wrapper::vloadq(input2_ptr + x);
503 const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
504 a = wrapper::vloadq(input1_ptr + x + 4);
505 b = wrapper::vloadq(input2_ptr + x + 4);
506 const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
507 wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
508 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100509 if (x <= window_end_x - 4)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200510 {
511 const auto a = wrapper::vloadq(input1_ptr + x);
512 const auto b = wrapper::vloadq(input2_ptr + x);
513 const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100514 for (int i = 0; i < 4; i++)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200515 {
516 *(output_ptr + x + i) = wrapper::vgetlane(res, i);
517 }
518 x = +4;
519 }
520 return x;
521}
522
523template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
524void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
525{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100526 elementwise_op<InputScalarType, uint8_t, InputVectorType>(
527 in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
528 &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
529 &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200530}
531
532template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
533void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
534{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100535 elementwise_op<InputScalarType, uint8_t, InputVectorType>(
536 in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
537 &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
538 &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200539}
540
541template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
542void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
543{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100544 elementwise_op<InputScalarType, uint8_t, InputVectorType>(
545 in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
546 &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
547 &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200548}
549
550inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
551{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100552 qasymm8x16_t x = vld1q_u8(input1_ptr);
553 const float32x4x4_t out = {{
554 vmulq_f32(
555 vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
556 scale),
557 vmulq_f32(
558 vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
559 scale),
560 vmulq_f32(
561 vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
562 scale),
563 vmulq_f32(vcvtq_f32_s32(
564 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
565 scale),
566 }};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200567 return out;
568}
569
570inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
571{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100572 qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
573 const float32x4x4_t out = {{
574 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
575 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
576 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
577 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
578 }};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200579 return out;
580}
581
582inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
583{
584 const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
585 const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
586 vst1q_u8(output_ptr, vcombine_u8(pa, pb));
587}
588
589inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
590{
591 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
592 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
593 vst1q_u8(output_ptr, vcombine_u8(pa, pb));
594}
595
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100596inline void
597store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200598{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100599 int32x4x4_t out = {{
600 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
601 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
602 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
603 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
604 }};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200605 store_quantized(output_ptr, out);
606}
607
608inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
609{
610 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
611 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
612 vst1q_s8(output_ptr, vcombine_s8(pa, pb));
613}
614
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100615inline void store_quantized_signed(int8_t *output_ptr,
616 const float32x4x4_t &rf,
617 const float32x4_t &offset,
618 const float32x4_t &invscale)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200619{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100620 int32x4x4_t out = {{
621 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
622 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
623 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
624 vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
625 }};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200626 store_quantized_signed(output_ptr, out);
627}
628
629template <ArithmeticOperation op>
630inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
631{
632 return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
633}
634
635template <ArithmeticOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100636inline int8_t
637elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200638{
639 return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
640}
641
642template <ArithmeticOperation op>
643float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
644{
645 using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100646 float32x4x4_t out = {{
647 elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
648 elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
649 elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
650 elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
651 }};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200652 return out;
653}
654
655template <ComparisonOperation op>
656inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
657{
658 ARM_COMPUTE_UNUSED(qinfo);
659 return elementwise_comp_op_scalar<op>(a, b);
660}
661
662template <ComparisonOperation op>
663inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
664{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100665 uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
666 elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
667 elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
668 elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200669 return out;
670}
671
672template <ArithmeticOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100673inline int elementwise_arithm_op_quantized_loop(int window_start_x,
674 int window_end_x,
675 int window_step_x,
676 const uint8_t *input1_ptr,
677 const uint8_t *input2_ptr,
678 uint8_t *output_ptr,
679 int32x4_t voffset1,
680 int32x4_t voffset2,
681 float32x4_t vscale1,
682 float32x4_t vscale2,
683 float32x4_t voffseto,
684 float32x4_t invvscaleo)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200685{
686 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100687 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200688 {
689 // Get inputs and compute output
690 const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
691 const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
692 const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
693 store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
694 }
695 return x;
696}
697
698template <ArithmeticOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100699inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x,
700 int window_end_x,
701 int window_step_x,
702 const int8_t *input1_ptr,
703 const int8_t *input2_ptr,
704 int8_t *output_ptr,
705 int32x4_t voffset1,
706 int32x4_t voffset2,
707 float32x4_t vscale1,
708 float32x4_t vscale2,
709 float32x4_t voffseto,
710 float32x4_t invvscaleo)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200711{
712 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100713 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200714 {
715 // Get inputs and compute output
716 const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
717 const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
718 const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
719 store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
720 }
721 return x;
722}
723
724template <ArithmeticOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100725inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x,
726 int window_end_x,
727 int window_step_x,
728 const uint8_t *non_broadcast_input_ptr,
729 float32x4x4_t broadcast_vector,
730 uint8_t *output_ptr,
731 int32x4_t voffset_non_broadcast,
732 float32x4_t vscale_non_broadcast,
733 float32x4_t voffseto,
734 float32x4_t invvscaleo,
735 bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200736{
737 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100738 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200739 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100740 const float32x4x4_t af =
741 load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
742 const float32x4x4_t rf =
743 elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200744 store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
745 }
746 return x;
747}
748template <ArithmeticOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100749inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x,
750 int window_end_x,
751 int window_step_x,
752 const int8_t *non_broadcast_input_ptr,
753 float32x4x4_t broadcast_vector,
754 int8_t *output_ptr,
755 int32x4_t voffset_non_broadcast,
756 float32x4_t vscale_non_broadcast,
757 float32x4_t voffseto,
758 float32x4_t invvscaleo,
759 bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200760{
761 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100762 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200763 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100764 const float32x4x4_t af =
765 load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
766 const float32x4x4_t rf =
767 elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200768 store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
769 }
770 return x;
771}
772
773template <ComparisonOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100774inline int elementwise_comp_op_quantized_loop(int window_start_x,
775 int window_end_x,
776 int window_step_x,
777 const uint8_t *input1_ptr,
778 const uint8_t *input2_ptr,
779 uint8_t *output_ptr,
780 int32x4_t voffset1,
781 int32x4_t voffset2,
782 float32x4_t vscale1,
783 float32x4_t vscale2,
784 float32x4_t voffseto,
785 float32x4_t invvscaleo)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200786{
787 ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
788 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100789 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200790 {
791 const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
792 const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
793 const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
794 store_quantized(output_ptr + x, rf);
795 }
796 return x;
797}
798
799template <ComparisonOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100800inline int elementwise_comp_op_quantized_signed_loop(int window_start_x,
801 int window_end_x,
802 int window_step_x,
803 const int8_t *input1_ptr,
804 const int8_t *input2_ptr,
805 uint8_t *output_ptr,
806 int32x4_t voffset1,
807 int32x4_t voffset2,
808 float32x4_t vscale1,
809 float32x4_t vscale2,
810 float32x4_t voffseto,
811 float32x4_t invvscaleo)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200812{
813 ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
814 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100815 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200816 {
817 const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
818 const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
819 const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
820 store_quantized(output_ptr + x, rf);
821 }
822 return x;
823}
824
825template <ComparisonOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100826inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x,
827 int window_end_x,
828 int window_step_x,
829 const uint8_t *non_broadcast_input_ptr,
830 float32x4x4_t broadcast_vector,
831 uint8_t *output_ptr,
832 int32x4_t voffset_non_broadcast,
833 float32x4_t vscale_non_broadcast,
834 float32x4_t voffseto,
835 float32x4_t invvscaleo,
836 bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200837{
838 ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
839 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100840 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200841 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100842 const float32x4x4_t af =
843 load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
844 const uint32x4x4_t rf =
845 elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200846 store_quantized(output_ptr + x, rf);
847 }
848 return x;
849}
850
851template <ComparisonOperation op>
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100852inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x,
853 int window_end_x,
854 int window_step_x,
855 const int8_t *non_broadcast_input_ptr,
856 float32x4x4_t broadcast_vector,
857 uint8_t *output_ptr,
858 int32x4_t voffset_non_broadcast,
859 float32x4_t vscale_non_broadcast,
860 float32x4_t voffseto,
861 float32x4_t invvscaleo,
862 bool reorder)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200863{
864 ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
865 int x = window_start_x;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100866 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200867 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100868 const float32x4x4_t af =
869 load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
870 const uint32x4x4_t rf =
871 elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200872 store_quantized(output_ptr + x, rf);
873 }
874 return x;
875}
876
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100877inline void elementwise_op_quantized(const ITensor *in1,
878 const ITensor *in2,
879 ITensor *out,
880 const Window &window,
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200881 uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100882 int (*broadcast_func)(int,
883 int,
884 int,
885 const uint8_t *,
886 float32x4x4_t,
887 uint8_t *,
888 int32x4_t,
889 float32x4_t,
890 float32x4_t,
891 float32x4_t,
892 const bool),
893 int (*neon_func)(int,
894 int,
895 int,
896 const uint8_t *,
897 const uint8_t *,
898 uint8_t *,
899 int32x4_t,
900 int32x4_t,
901 float32x4_t,
902 float32x4_t,
903 float32x4_t,
904 float32x4_t))
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200905{
906 // Create input windows
907 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
908 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
909
910 // Clear X Dimension on execution window as we handle manually
911 Window win = window;
912 win.set(Window::DimX, Window::Dimension(0, 1, 1));
913
914 const int window_step_x = 16;
915 const auto window_start_x = static_cast<int>(window.x().start());
916 const auto window_end_x = static_cast<int>(window.x().end());
917 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
918
919 const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
920
921 // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero)
922 const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
923 const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
924
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100925 if (is_broadcast_across_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200926 {
927 // Select the broadcast input on the X axis
928 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
929 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
930 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
931 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
932 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
933
934 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
935 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
936
937 const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
938 const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
939
940 // Clear X Dimension on execution window as we handle manually
941 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
942
943 Iterator broadcast_input(broadcast_tensor, broadcast_win);
944 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
945 Iterator output(out, win);
946
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100947 execute_window_loop(
948 win,
949 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200950 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100951 const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
952 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
953
954 const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
955 const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
956
957 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
958 broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
959 voffseto, invvscaleo, !is_broadcast_input_2);
960 for (; x < window_end_x; ++x)
961 {
962 const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
963 const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
964 *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
965 !is_broadcast_input_2 ? afs : bfs, output_qinfo);
966 }
967 },
968 broadcast_input, non_broadcast_input, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200969 }
970 else
971 {
972 const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
973 const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
974
975 // Input1 quantization info
976 const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
977 const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
978
979 // Input2 quantization info
980 const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
981 const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
982
983 // Clear X Dimension on execution window as we handle manually
984 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
985 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
986
987 Iterator input1(in1, input1_win);
988 Iterator input2(in2, input2_win);
989 Iterator output(out, win);
990
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100991 execute_window_loop(
992 win,
993 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +0200994 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100995 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
996 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
997 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
998
999 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
1000 voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
1001 for (; x < window_end_x; ++x)
1002 {
1003 const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
1004 const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
1005 *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
1006 }
1007 },
1008 input1, input2, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001009 }
1010}
1011
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001012inline void
1013elementwise_comp_quantized_signed(const ITensor *in1,
1014 const ITensor *in2,
1015 ITensor *out,
1016 const Window &window,
1017 uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
1018 int (*broadcast_func)(int,
1019 int,
1020 int,
1021 const int8_t *,
1022 float32x4x4_t,
1023 uint8_t *,
1024 int32x4_t,
1025 float32x4_t,
1026 float32x4_t,
1027 float32x4_t,
1028 const bool),
1029 int (*neon_func)(int,
1030 int,
1031 int,
1032 const int8_t *,
1033 const int8_t *,
1034 uint8_t *,
1035 int32x4_t,
1036 int32x4_t,
1037 float32x4_t,
1038 float32x4_t,
1039 float32x4_t,
1040 float32x4_t))
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001041{
1042 // Create input windows
1043 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
1044 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
1045
1046 // Clear X Dimension on execution window as we handle manually
1047 Window win = window;
1048 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1049
1050 const int window_step_x = 16;
1051 const auto window_start_x = static_cast<int>(window.x().start());
1052 const auto window_end_x = static_cast<int>(window.x().end());
1053 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
1054
1055 const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
1056
1057 const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
1058 const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
1059
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001060 if (is_broadcast_across_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001061 {
1062 // Select the broadcast input on the X axis
1063 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1064 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1065 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
1066 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
1067 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
1068
1069 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
1070 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
1071
1072 const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
1073 const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
1074
1075 // Clear X Dimension on execution window as we handle manually
1076 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1077
1078 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1079 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
1080 Iterator output(out, win);
1081
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001082 execute_window_loop(
1083 win,
1084 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001085 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001086 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
1087 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1088
1089 const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
1090 const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
1091
1092 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
1093 broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
1094 voffseto, invvscaleo, !is_broadcast_input_2);
1095 for (; x < window_end_x; ++x)
1096 {
1097 const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
1098 const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
1099 *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
1100 !is_broadcast_input_2 ? afs : bfs, output_qinfo);
1101 }
1102 },
1103 broadcast_input, non_broadcast_input, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001104 }
1105 else
1106 {
1107 const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
1108 const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
1109
1110 // Input1 quantization info
1111 const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
1112 const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
1113
1114 // Input2 quantization info
1115 const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
1116 const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
1117
1118 // Clear X Dimension on execution window as we handle manually
1119 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1120 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1121
1122 Iterator input1(in1, input1_win);
1123 Iterator input2(in2, input2_win);
1124 Iterator output(out, win);
1125
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001126 execute_window_loop(
1127 win,
1128 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001129 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001130 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
1131 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
1132 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1133
1134 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
1135 voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
1136 for (; x < window_end_x; ++x)
1137 {
1138 const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
1139 const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
1140 *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
1141 }
1142 },
1143 input1, input2, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001144 }
1145}
1146
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001147inline void
1148elementwise_op_quantized_signed(const ITensor *in1,
1149 const ITensor *in2,
1150 ITensor *out,
1151 const Window &window,
1152 int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
1153 int (*broadcast_func)(int,
1154 int,
1155 int,
1156 const int8_t *,
1157 float32x4x4_t,
1158 int8_t *,
1159 int32x4_t,
1160 float32x4_t,
1161 float32x4_t,
1162 float32x4_t,
1163 const bool),
1164 int (*neon_func)(int,
1165 int,
1166 int,
1167 const int8_t *,
1168 const int8_t *,
1169 int8_t *,
1170 int32x4_t,
1171 int32x4_t,
1172 float32x4_t,
1173 float32x4_t,
1174 float32x4_t,
1175 float32x4_t))
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001176{
1177 // Create input windows
1178 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
1179 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
1180
1181 // Clear X Dimension on execution window as we handle manually
1182 Window win = window;
1183 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1184
1185 const int window_step_x = 16;
1186 const auto window_start_x = static_cast<int>(window.x().start());
1187 const auto window_end_x = static_cast<int>(window.x().end());
1188 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
1189
1190 const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
1191
1192 const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
1193 const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
1194
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001195 if (is_broadcast_across_x)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001196 {
1197 // Select the broadcast input on the X axis
1198 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1199 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1200 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
1201 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
1202 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
1203
1204 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
1205 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
1206
1207 const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
1208 const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
1209
1210 // Clear X Dimension on execution window as we handle manually
1211 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1212
1213 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1214 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
1215 Iterator output(out, win);
1216
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001217 execute_window_loop(
1218 win,
1219 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001220 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001221 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
1222 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1223
1224 const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
1225 const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
1226
1227 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
1228 broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
1229 voffseto, invvscaleo, !is_broadcast_input_2);
1230 for (; x < window_end_x; ++x)
1231 {
1232 const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
1233 const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
1234 *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
1235 !is_broadcast_input_2 ? afs : bfs, output_qinfo);
1236 }
1237 },
1238 broadcast_input, non_broadcast_input, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001239 }
1240 else
1241 {
1242 const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
1243 const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
1244
1245 // Input1 quantization info
1246 const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
1247 const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
1248
1249 // Input2 quantization info
1250 const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
1251 const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
1252
1253 // Clear X Dimension on execution window as we handle manually
1254 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1255 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1256
1257 Iterator input1(in1, input1_win);
1258 Iterator input2(in2, input2_win);
1259 Iterator output(out, win);
1260
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001261 execute_window_loop(
1262 win,
1263 [&](const Coordinates &)
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001264 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001265 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
1266 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
1267 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1268
1269 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
1270 voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
1271 for (; x < window_end_x; ++x)
1272 {
1273 const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
1274 const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
1275 *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
1276 }
1277 },
1278 input1, input2, output);
Dana Zlotnikd5c496d2021-11-28 14:46:12 +02001279 }
1280}
1281
1282template <ArithmeticOperation op>
1283void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
1284{
1285 elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
1286 &elementwise_arithm_op_quantized_broadcast_loop<op>,
1287 &elementwise_arithm_op_quantized_loop<op>);
1288}
1289
1290template <ArithmeticOperation op>
1291void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
1292{
1293 elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
1294 &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
1295 &elementwise_arithm_op_quantized_singed_loop<op>);
1296}
1297
1298template <ComparisonOperation op>
1299void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
1300{
1301 elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
1302 &elementwise_comp_op_quantized_broadcast_loop<op>,
1303 &elementwise_comp_op_quantized_loop<op>);
1304}
1305
1306template <ComparisonOperation op>
1307void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
1308{
1309 elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
1310 &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
1311 &elementwise_comp_op_quantized_signed_loop<op>);
1312}
1313} // namespace cpu
1314} // namespace arm_compute
1315
1316#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_IMPL_H */