blob: fc211f7b426c3b5d795e3e77a4dfa16bce93cb1c [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/ITensor.h"
Georgios Pinitas5a594532018-12-03 14:30:05 +000030#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031#include "arm_compute/core/Validate.h"
32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include <map>
34#include <string>
35
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036namespace arm_compute
37{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038namespace
39{
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010040template <typename T>
41void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, const ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042{
Georgios Pinitas5a594532018-12-03 14:30:05 +000043 /** NEON vector tag type. */
44 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
45
46 // Create input windows
47 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
48 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
49
50 // Clear X Dimension on execution window as we handle manually
51 Window win = window;
52 win.set(Window::DimX, Window::Dimension(0, 1, 1));
53
54 constexpr int window_step_x = 16 / sizeof(T);
55 const auto window_start_x = static_cast<int>(window.x().start());
56 const auto window_end_x = static_cast<int>(window.x().end());
57 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
58
59 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010060 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000061 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
65 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010066
Georgios Pinitas5a594532018-12-03 14:30:05 +000067 // Clear X Dimension on execution window as we handle manually
68 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010069
Georgios Pinitas5a594532018-12-03 14:30:05 +000070 Iterator broadcast_input(broadcast_tensor, broadcast_win);
71 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
72 Iterator output(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073
Michalis Spyroua4f378d2019-04-26 14:54:54 +010074 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000076 const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
77 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +010078
Georgios Pinitas5a594532018-12-03 14:30:05 +000079 const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
80 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
Anthony Barbier6ff3b192017-09-04 18:44:23 +010081
Georgios Pinitas5a594532018-12-03 14:30:05 +000082 // Compute S elements per iteration
83 int x = window_start_x;
84 for(; x <= (window_end_x - window_step_x); x += window_step_x)
85 {
86 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010087 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
Georgios Pinitas5a594532018-12-03 14:30:05 +000088 wrapper::vstore(output_ptr + x, res);
89 }
90
91 // Compute left-over elements
92 for(; x < window_end_x; ++x)
93 {
94 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010095 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
Georgios Pinitas5a594532018-12-03 14:30:05 +000096 }
97 },
98 broadcast_input, non_broadcast_input, output);
99 }
100 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100101 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000102 // Clear X Dimension on execution window as we handle manually
103 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
104 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
105
106 Iterator input1(in1, input1_win);
107 Iterator input2(in2, input2_win);
108 Iterator output(out, win);
109
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100110 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100111 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000112 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
113 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
114 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100115
Georgios Pinitas5a594532018-12-03 14:30:05 +0000116 // Compute S elements per iteration
117 int x = window_start_x;
118 for(; x <= (window_end_x - window_step_x); x += window_step_x)
119 {
120 const auto val1 = wrapper::vloadq(input1_ptr + x);
121 const auto val2 = wrapper::vloadq(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100122 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000123 wrapper::vstore(output_ptr + x, res);
124 }
125
126 // Compute left-over elements
127 for(; x < window_end_x; ++x)
128 {
129 const auto val1 = *(input1_ptr + x);
130 const auto val2 = *(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100131 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
Georgios Pinitas5a594532018-12-03 14:30:05 +0000132 }
133 },
134 input1, input2, output);
135 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136}
137
Georgios Pinitas5a594532018-12-03 14:30:05 +0000138void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100139{
Georgios Pinitas5a594532018-12-03 14:30:05 +0000140 ARM_COMPUTE_UNUSED(policy);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141
Georgios Pinitas5a594532018-12-03 14:30:05 +0000142 // Create input windows
143 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
144 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100145
Georgios Pinitas5a594532018-12-03 14:30:05 +0000146 // Clear X Dimension on execution window as we handle manually
147 Window win = window;
148 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100149
Georgios Pinitas5a594532018-12-03 14:30:05 +0000150 const int window_step_x = 16;
151 const auto window_start_x = static_cast<int>(window.x().start());
152 const auto window_end_x = static_cast<int>(window.x().end());
153 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100154
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100155 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
156 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
157 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000158
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100159 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
160 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
161 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
162 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
163 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
164 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000165
Georgios Pinitas5a594532018-12-03 14:30:05 +0000166 if(is_broadcast_across_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000167 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100168 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
169 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
170 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
171 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
172 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
173 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
174 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000175
Georgios Pinitas5a594532018-12-03 14:30:05 +0000176 // Clear X Dimension on execution window as we handle manually
177 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
178
179 Iterator broadcast_input(broadcast_tensor, broadcast_win);
180 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
181 Iterator output(out, win);
182
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100183 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000184 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000185 const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
186 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000187
Georgios Pinitas5a594532018-12-03 14:30:05 +0000188 const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
189 const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
190
191 const float32x4x4_t bf =
192 {
193 {
194 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
195 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
196 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
197 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
198 }
199 };
200 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
201
202 // Compute S elements per iteration
203 int x = window_start_x;
204 for(; x <= (window_end_x - window_step_x); x += window_step_x)
205 {
206 const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
207 const float32x4x4_t af =
208 {
209 {
210 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
211 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
212 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
213 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
214 }
215 };
216
217 const int32x4x4_t rf =
218 {
219 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000220#ifdef __aarch64__
221 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
222 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
223 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
224 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100225#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000226 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
227 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
228 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
229 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000230#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000231 }
232 };
233
234 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
235 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
236 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
237 }
238
239 // Compute left-over elements
240 for(; x < window_end_x; ++x)
241 {
242 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100243 *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000244 }
245 },
246 broadcast_input, non_broadcast_input, output);
247 }
248 else
249 {
250 // Clear X Dimension on execution window as we handle manually
251 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
252 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
253
Georgios Pinitas5a594532018-12-03 14:30:05 +0000254 Iterator input1(in1, input1_win);
255 Iterator input2(in2, input2_win);
256 Iterator output(out, win);
257
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100258 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000259 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000260 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
261 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
262 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000263
Georgios Pinitas5a594532018-12-03 14:30:05 +0000264 // Compute S elements per iteration
265 int x = window_start_x;
266 for(; x <= (window_end_x - window_step_x); x += window_step_x)
267 {
268 const uint8x16_t a = vld1q_u8(input1_ptr + x);
269 const uint8x16_t b = vld1q_u8(input2_ptr + x);
270
271 const float32x4x4_t af =
272 {
273 {
274 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
275 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
276 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
277 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
278 }
279 };
280
281 const float32x4x4_t bf =
282 {
283 {
284 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
285 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
286 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
287 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
288 }
289 };
290
291 const int32x4x4_t rf =
292 {
293 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000294#ifdef __aarch64__
295 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
296 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
297 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
298 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100299#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000300 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
301 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
302 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
303 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000304#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000305 }
306 };
307
308 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
309 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
310 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
311 }
312
313 // Compute left-over elements
314 for(; x < window_end_x; ++x)
315 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100316 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
317 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
318 *(output_ptr + x) = quantize_qasymm8((afs + bfs), out->info()->quantization_info());
Georgios Pinitas5a594532018-12-03 14:30:05 +0000319 }
320 },
321 input1, input2, output);
322 }
323}
324
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000325void add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
326{
327 ARM_COMPUTE_UNUSED(policy);
328
329 // Create input windows
330 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
331 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
332
333 // Clear X Dimension on execution window as we handle manually
334 Window win = window;
335 win.set(Window::DimX, Window::Dimension(0, 1, 1));
336
337 const int window_step_x = 16;
338 const auto window_start_x = static_cast<int>(window.x().start());
339 const auto window_end_x = static_cast<int>(window.x().end());
340 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
341
342 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
343 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
344 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
345
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000346 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000347 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
348
349 if(is_broadcast_across_x)
350 {
351 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
352 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
353 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
354 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
355 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
356 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
357 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
358
Michalis Spyrou2232a202020-07-13 15:15:33 +0100359 const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
360 const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
361 const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
362 const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
363
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000364 // Clear X Dimension on execution window as we handle manually
365 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
366
367 Iterator broadcast_input(broadcast_tensor, broadcast_win);
368 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
369 Iterator output(out, win);
370
371 execute_window_loop(win, [&](const Coordinates &)
372 {
373 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
374 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
375
376 const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
377 const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
378
379 const float32x4x4_t bf =
380 {
381 {
382 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
383 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
384 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
385 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
386 }
387 };
388 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
389
390 // Compute S elements per iteration
391 int x = window_start_x;
392 for(; x <= (window_end_x - window_step_x); x += window_step_x)
393 {
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000394 const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000395 const float32x4x4_t af =
396 {
397 {
398 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
399 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
400 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
401 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
402 }
403 };
404
405 const int32x4x4_t rf =
406 {
407 {
408#ifdef __aarch64__
409 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
410 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
411 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
412 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
413#else //__aarch64__
414 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
415 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
416 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
417 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
418#endif //__aarch64__
419 }
420 };
421
422 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
423 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
424 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
425 }
426
427 // Compute left-over elements
428 for(; x < window_end_x; ++x)
429 {
430 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
431 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
432 }
433 },
434 broadcast_input, non_broadcast_input, output);
435 }
436 else
437 {
438 // Clear X Dimension on execution window as we handle manually
439 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
440 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
441
442 Iterator input1(in1, input1_win);
443 Iterator input2(in2, input2_win);
444 Iterator output(out, win);
445
Michalis Spyrou2232a202020-07-13 15:15:33 +0100446 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
447 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
448 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
449 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000450 execute_window_loop(win, [&](const Coordinates &)
451 {
452 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
453 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
454 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
455
456 // Compute S elements per iteration
457 int x = window_start_x;
458 for(; x <= (window_end_x - window_step_x); x += window_step_x)
459 {
460 const int8x16_t a = vld1q_s8(input1_ptr + x);
461 const int8x16_t b = vld1q_s8(input2_ptr + x);
462
463 const float32x4x4_t af =
464 {
465 {
466 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
467 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
468 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
469 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
470 }
471 };
472
473 const float32x4x4_t bf =
474 {
475 {
476 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
477 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
478 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
479 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
480 }
481 };
482
483 const int32x4x4_t rf =
484 {
485 {
486#ifdef __aarch64__
487 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
488 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
489 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
490 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
491#else //__aarch64__
492 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
493 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
494 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
495 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
496#endif //__aarch64__
497 }
498 };
499
500 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
501 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
502 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
503 }
504
505 // Compute left-over elements
506 for(; x < window_end_x; ++x)
507 {
508 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
509 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
510 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), out->info()->quantization_info());
511 }
512 },
513 input1, input2, output);
514 }
515}
516
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100517void add_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
518{
519 ARM_COMPUTE_UNUSED(policy);
520
521 // Create input windows
522 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
523 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
524
525 // Clear X Dimension on execution window as we handle manually
526 Window win = window;
527 win.set(Window::DimX, Window::Dimension(0, 1, 1));
528
529 const int window_step_x = 8;
530 const auto window_start_x = static_cast<int>(window.x().start());
531 const auto window_end_x = static_cast<int>(window.x().end());
532 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
533
534 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
535 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
536 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
537
538 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
539 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
540 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
541
542 if(is_broadcast_across_x)
543 {
544 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
545 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
546 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
547 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
548 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
549 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
550 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
551
552 // Clear X Dimension on execution window as we handle manually
553 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
554
555 Iterator broadcast_input(broadcast_tensor, broadcast_win);
556 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
557 Iterator output(out, win);
558
559 execute_window_loop(win, [&](const Coordinates &)
560 {
561 const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
562 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
563
564 const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
565 const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
566
567 const float32x4x2_t bf =
568 {
569 {
570 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
571 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
572 }
573 };
574 const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
575
576 // Compute S elements per iteration
577 int x = window_start_x;
578 for(; x <= (window_end_x - window_step_x); x += window_step_x)
579 {
580 const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
581 const float32x4x2_t af =
582 {
583 {
584 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
585 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
586 }
587 };
588
589 const int32x4x4_t rf =
590 {
591 {
592#ifdef __aarch64__
593 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
594 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
595#else //__aarch64__
596 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
597 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
598#endif //__aarch64__
599 }
600 };
601
602 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
603 vst1q_s16(output_ptr + x, pa);
604 }
605
606 // Compute left-over elements
607 for(; x < window_end_x; ++x)
608 {
609 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
610 *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
611 }
612 },
613 broadcast_input, non_broadcast_input, output);
614 }
615 else
616 {
617 // Clear X Dimension on execution window as we handle manually
618 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
619 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
620
621 Iterator input1(in1, input1_win);
622 Iterator input2(in2, input2_win);
623 Iterator output(out, win);
624
625 execute_window_loop(win, [&](const Coordinates &)
626 {
627 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
628 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
629 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
630
631 // Compute S elements per iteration
632 int x = window_start_x;
633 for(; x <= (window_end_x - window_step_x); x += window_step_x)
634 {
635 const int16x8_t a = vld1q_s16(input1_ptr + x);
636 const int16x8_t b = vld1q_s16(input2_ptr + x);
637
638 const float32x4x2_t af =
639 {
640 {
641 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
642 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
643 }
644 };
645
646 const float32x4x2_t bf =
647 {
648 {
649 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
650 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
651 }
652 };
653
654 const int32x4x2_t rf =
655 {
656 {
657#ifdef __aarch64__
658 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
659 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
660#else //__aarch64__
661 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
662 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
663#endif //__aarch64__
664 }
665 };
666
667 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
668 vst1q_s16(output_ptr + x, pa);
669 }
670
671 // Compute left-over elements
672 for(; x < window_end_x; ++x)
673 {
674 const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
675 const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
676 *(output_ptr + x) = quantize_qsymm16((afs + bfs), out->info()->quantization_info());
677 }
678 },
679 input1, input2, output);
680 }
681}
682
Georgios Pinitas5a594532018-12-03 14:30:05 +0000683void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
684{
685 // Create input windows
686 Window win = window;
687 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
688 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
689
690 // Clear X Dimension on execution window as we handle manually
691 win.set(Window::DimX, Window::Dimension(0, 1, 1));
692 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
693 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
694
695 Iterator input1(in1, input1_win);
696 Iterator input2(in2, input2_win);
697 Iterator output(out, win);
698
699 const int window_step_x = 8;
700 const auto window_start_x = static_cast<int>(window.x().start());
701 const auto window_end_x = static_cast<int>(window.x().end());
702
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100703 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitas5a594532018-12-03 14:30:05 +0000704 {
705 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
706 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
707 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
708
709 if(policy == ConvertPolicy::WRAP)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000710 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000711 // Compute S elements per iteration
712 int x = window_start_x;
713 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000714 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000715 const auto vin1 = wrapper::vloadq(input1_ptr + x);
716 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
717 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000718 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000719
Georgios Pinitas5a594532018-12-03 14:30:05 +0000720 // Compute left-over elements
721 for(; x < window_end_x; ++x)
722 {
723 *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
724 }
725 }
726 else
727 {
728 // Compute S elements per iteration
729 int x = window_start_x;
730 for(; x <= (window_end_x - window_step_x); x += window_step_x)
731 {
732 const auto vin1 = wrapper::vloadq(input1_ptr + x);
733 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
734 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
735 }
736
737 // Compute left-over elements
738 for(; x < window_end_x; ++x)
739 {
740 *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
741 }
742 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000743 },
744 input1, input2, output);
745}
746
// U8 + S16 -> S16 variant. Addition is commutative, so this forwards to the
// S16 + U8 kernel with the two operands exchanged; 'policy' and 'window' are
// passed through unchanged.
inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
{
    // Simply swap the two input buffers:
    add_S16_U8_S16(input2, input1, output, policy, window);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100752
Georgios Pinitas5a594532018-12-03 14:30:05 +0000753void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
754{
755 // Create input windows
756 Window win = window;
757 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
758 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
759
760 // Clear X Dimension on execution window as we handle manually
761 win.set(Window::DimX, Window::Dimension(0, 1, 1));
762 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
763 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
764
765 Iterator input1(in1, input1_win);
766 Iterator input2(in2, input2_win);
767 Iterator output(out, win);
768
769 const int window_step_x = 8;
770 const auto window_start_x = static_cast<int>(window.x().start());
771 const auto window_end_x = static_cast<int>(window.x().end());
772
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100773 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100774 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000775 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
776 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
777 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100778
Georgios Pinitas5a594532018-12-03 14:30:05 +0000779 if(policy == ConvertPolicy::WRAP)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100780 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000781 // Compute S elements per iteration
782 int x = window_start_x;
783 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100784 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000785 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
786 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
787 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100788 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100789
Georgios Pinitas5a594532018-12-03 14:30:05 +0000790 // Compute left-over elements
791 for(; x < window_end_x; ++x)
792 {
793 *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
794 }
795 }
796 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100797 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000798 // Compute S elements per iteration
799 int x = window_start_x;
800 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100801 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000802 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
803 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
804 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100805 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100806
Georgios Pinitas5a594532018-12-03 14:30:05 +0000807 // Compute left-over elements
808 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100809 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000810 *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
811 static_cast<int16_t>(*(input2_ptr + x)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100812 }
Georgios Pinitas5a594532018-12-03 14:30:05 +0000813 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100814 },
815 input1, input2, output);
816}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000817
// Static validation of an addition configuration.
// Checks (in order, returning the first failure):
//  - input data types are from the supported set,
//  - the input shapes are broadcast compatible,
//  - broadcasting across width is only used when all three tensors share one data type,
//  - if the output is already configured: the type combination is one of the
//    supported (src1, src2, dst) triplets and the output shape matches the
//    broadcast shape.
// 'policy' is accepted for signature symmetry but not inspected here.
Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);

    // A total size of 0 means the two shapes cannot be broadcast together.
    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
                                                                                                || (input2.data_type() != output.data_type())),
                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        // Whitelist of supported (input1, input2, output) data-type triplets;
        // must stay in sync with the dispatch table built in configure().
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16),
            "You called addition with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }

    return Status{};
}
860
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100861std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000862{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000863 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000864 const TensorShape &out_shape = broadcast_pair.first;
865 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000866
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100867 // Auto initialize output if not initialized
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100868 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000869 set_shape_if_empty(output, out_shape);
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100870
Michele Di Giorgio19023832020-06-17 16:08:10 +0000871 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000872 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000873 set_format_if_unknown(output, Format::S16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000874 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000875 if(input1.data_type() == DataType::S32 || input2.data_type() == DataType::S32)
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100876 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000877 set_format_if_unknown(output, Format::S32);
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100878 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000879 else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000880 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000881 set_format_if_unknown(output, Format::F16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000882 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000883 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000884 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000885 set_format_if_unknown(output, Format::F32);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000886 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000887 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000888 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000889 set_data_type_if_unknown(output, DataType::QASYMM8);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000890 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000891 else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000892 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000893 set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000894 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000895 else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100896 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000897 set_data_type_if_unknown(output, DataType::QSYMM16);
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100898 }
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000899 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000900
Georgios Pinitas5a594532018-12-03 14:30:05 +0000901 Window win = calculate_max_window(valid_region, Steps());
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000902
Georgios Pinitas5a594532018-12-03 14:30:05 +0000903 // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
904 Coordinates coord;
Michele Di Giorgio19023832020-06-17 16:08:10 +0000905 coord.set_num_dimensions(output.num_dimensions());
906 output.set_valid_region(valid_region);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000907 return std::make_pair(Status{}, win);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000908}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100909} // namespace
910
// Default constructor: leaves the kernel unconfigured (_func is null until
// configure() selects the specialized add function and conversion policy).
NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
    : _func(nullptr), _policy()
{
}
915
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100916void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100917{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000918 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100919 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100920
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000921 // Configure kernel window
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100922 auto win_config = validate_and_configure_window(*input1, *input2, *output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000923 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100924
925 static std::map<std::string, AddFunction *> map_function =
926 {
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000927 { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
928 { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000929 { "add_wrap_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
930 { "add_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100931 { "add_wrap_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
932 { "add_saturate_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100933 { "add_wrap_U8_U8_U8", &add_same<uint8_t> },
934 { "add_saturate_U8_U8_U8", &add_same<uint8_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000935 { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
936 { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
937 { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
938 { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
939 { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
940 { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100941 { "add_wrap_S16_S16_S16", &add_same<int16_t> },
942 { "add_saturate_S16_S16_S16", &add_same<int16_t> },
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100943 { "add_wrap_S32_S32_S32", &add_same<int32_t> },
944 { "add_saturate_S32_S32_S32", &add_same<int32_t> },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100945 { "add_wrap_F32_F32_F32", &add_same<float> },
946 { "add_saturate_F32_F32_F32", &add_same<float> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000947#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100948 { "add_wrap_F16_F16_F16", &add_same<float16_t> },
949 { "add_saturate_F16_F16_F16", &add_same<float16_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000950#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100951 };
952
Georgios Pinitas5a594532018-12-03 14:30:05 +0000953 _policy = policy;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100954
955 std::string function_to_call("add_");
956 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100957 function_to_call += string_from_data_type(input1->data_type()) + "_";
958 function_to_call += string_from_data_type(input2->data_type()) + "_";
959 function_to_call += string_from_data_type(output->data_type());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100960
961 auto it = map_function.find(function_to_call);
962
963 if(it != map_function.end())
964 {
965 _func = it->second;
966 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100967
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000968 INEKernel::configure(win_config.second);
969}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100970
// Static validation entry point: checks the configuration without touching any
// state. Returns an error Status on the first failed check, Status{} otherwise.
Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);

    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
    // validate_and_configure_window() mutates its arguments (auto-init of the
    // output), so run it on clones to keep the caller's infos untouched.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);

    return Status{};
}
980
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100981void NEArithmeticAdditionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100982{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100983 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100984 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
985 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100986 // Dispatch kernel
987 (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), _policy, window);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000988}
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100989} // namespace arm_compute