blob: aa7af54e9cbb02886ff282870d4c45c2175dcc03 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEArithmeticAdditionKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Validate.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010030#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010031#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035#include <map>
36#include <string>
37
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038namespace arm_compute
39{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010040namespace
41{
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010042template <typename T>
43void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, const ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044{
Georgios Pinitas5a594532018-12-03 14:30:05 +000045 /** NEON vector tag type. */
46 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
47
48 // Create input windows
49 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
50 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
51
52 // Clear X Dimension on execution window as we handle manually
53 Window win = window;
54 win.set(Window::DimX, Window::Dimension(0, 1, 1));
55
56 constexpr int window_step_x = 16 / sizeof(T);
57 const auto window_start_x = static_cast<int>(window.x().start());
58 const auto window_end_x = static_cast<int>(window.x().end());
Georgios Pinitasd7341fb2020-11-12 15:05:01 +000059 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
Georgios Pinitas5a594532018-12-03 14:30:05 +000060
61 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010062 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000063 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
64 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
65 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
66 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
67 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010068
Georgios Pinitas5a594532018-12-03 14:30:05 +000069 // Clear X Dimension on execution window as we handle manually
70 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010071
Georgios Pinitas5a594532018-12-03 14:30:05 +000072 Iterator broadcast_input(broadcast_tensor, broadcast_win);
73 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
74 Iterator output(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075
Michalis Spyroua4f378d2019-04-26 14:54:54 +010076 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010077 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000078 const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
79 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +010080
Georgios Pinitas5a594532018-12-03 14:30:05 +000081 const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
82 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
Anthony Barbier6ff3b192017-09-04 18:44:23 +010083
Georgios Pinitas5a594532018-12-03 14:30:05 +000084 // Compute S elements per iteration
85 int x = window_start_x;
86 for(; x <= (window_end_x - window_step_x); x += window_step_x)
87 {
88 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010089 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
Georgios Pinitas5a594532018-12-03 14:30:05 +000090 wrapper::vstore(output_ptr + x, res);
91 }
92
93 // Compute left-over elements
94 for(; x < window_end_x; ++x)
95 {
96 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010097 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
Georgios Pinitas5a594532018-12-03 14:30:05 +000098 }
99 },
100 broadcast_input, non_broadcast_input, output);
101 }
102 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000104 // Clear X Dimension on execution window as we handle manually
105 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
106 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
107
108 Iterator input1(in1, input1_win);
109 Iterator input2(in2, input2_win);
110 Iterator output(out, win);
111
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100112 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100113 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000114 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
115 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
116 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100117
Georgios Pinitas5a594532018-12-03 14:30:05 +0000118 // Compute S elements per iteration
119 int x = window_start_x;
120 for(; x <= (window_end_x - window_step_x); x += window_step_x)
121 {
122 const auto val1 = wrapper::vloadq(input1_ptr + x);
123 const auto val2 = wrapper::vloadq(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100124 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000125 wrapper::vstore(output_ptr + x, res);
126 }
127
128 // Compute left-over elements
129 for(; x < window_end_x; ++x)
130 {
131 const auto val1 = *(input1_ptr + x);
132 const auto val2 = *(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100133 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
Georgios Pinitas5a594532018-12-03 14:30:05 +0000134 }
135 },
136 input1, input2, output);
137 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100138}
139
Georgios Pinitas5a594532018-12-03 14:30:05 +0000140void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141{
Georgios Pinitas5a594532018-12-03 14:30:05 +0000142 ARM_COMPUTE_UNUSED(policy);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100143
Georgios Pinitas5a594532018-12-03 14:30:05 +0000144 // Create input windows
145 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
146 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100147
Georgios Pinitas5a594532018-12-03 14:30:05 +0000148 // Clear X Dimension on execution window as we handle manually
149 Window win = window;
150 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100151
Georgios Pinitas5a594532018-12-03 14:30:05 +0000152 const int window_step_x = 16;
153 const auto window_start_x = static_cast<int>(window.x().start());
154 const auto window_end_x = static_cast<int>(window.x().end());
Georgios Pinitasd7341fb2020-11-12 15:05:01 +0000155 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100156
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100157 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
158 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
159 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000160
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100161 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100162 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000163
Georgios Pinitas5a594532018-12-03 14:30:05 +0000164 if(is_broadcast_across_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000165 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100166 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
167 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
168 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
169 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
170 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
171 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
172 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000173
Michalis Spyrou85b75992020-07-16 11:47:12 +0100174 const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
175 const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
176 const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
177 const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
178
Georgios Pinitas5a594532018-12-03 14:30:05 +0000179 // Clear X Dimension on execution window as we handle manually
180 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
181
182 Iterator broadcast_input(broadcast_tensor, broadcast_win);
183 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
184 Iterator output(out, win);
185
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100186 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000187 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000188 const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
189 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000190
Georgios Pinitas5a594532018-12-03 14:30:05 +0000191 const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
192 const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
193
194 const float32x4x4_t bf =
195 {
196 {
197 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
198 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
199 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
200 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
201 }
202 };
203 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
204
205 // Compute S elements per iteration
206 int x = window_start_x;
207 for(; x <= (window_end_x - window_step_x); x += window_step_x)
208 {
209 const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
210 const float32x4x4_t af =
211 {
212 {
213 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
214 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
215 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
216 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
217 }
218 };
219
220 const int32x4x4_t rf =
221 {
222 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000223#ifdef __aarch64__
224 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
225 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
226 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
227 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100228#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000229 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
230 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
231 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
232 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000233#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000234 }
235 };
236
237 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
238 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
239 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
240 }
241
242 // Compute left-over elements
243 for(; x < window_end_x; ++x)
244 {
245 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100246 *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000247 }
248 },
249 broadcast_input, non_broadcast_input, output);
250 }
251 else
252 {
253 // Clear X Dimension on execution window as we handle manually
254 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
255 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
256
Georgios Pinitas5a594532018-12-03 14:30:05 +0000257 Iterator input1(in1, input1_win);
258 Iterator input2(in2, input2_win);
259 Iterator output(out, win);
260
Michalis Spyrou85b75992020-07-16 11:47:12 +0100261 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
262 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
263 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
264 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
265
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100266 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000267 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000268 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
269 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
270 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000271
Georgios Pinitas5a594532018-12-03 14:30:05 +0000272 // Compute S elements per iteration
273 int x = window_start_x;
274 for(; x <= (window_end_x - window_step_x); x += window_step_x)
275 {
276 const uint8x16_t a = vld1q_u8(input1_ptr + x);
277 const uint8x16_t b = vld1q_u8(input2_ptr + x);
278
279 const float32x4x4_t af =
280 {
281 {
282 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
283 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
284 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
285 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
286 }
287 };
288
289 const float32x4x4_t bf =
290 {
291 {
292 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
293 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
294 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
295 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
296 }
297 };
298
299 const int32x4x4_t rf =
300 {
301 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000302#ifdef __aarch64__
303 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
304 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
305 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
306 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100307#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000308 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
309 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
310 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
311 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000312#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000313 }
314 };
315
316 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
317 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
318 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
319 }
320
321 // Compute left-over elements
322 for(; x < window_end_x; ++x)
323 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100324 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
325 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
326 *(output_ptr + x) = quantize_qasymm8((afs + bfs), out->info()->quantization_info());
Georgios Pinitas5a594532018-12-03 14:30:05 +0000327 }
328 },
329 input1, input2, output);
330 }
331}
332
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000333void add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
334{
335 ARM_COMPUTE_UNUSED(policy);
336
337 // Create input windows
338 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
339 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
340
341 // Clear X Dimension on execution window as we handle manually
342 Window win = window;
343 win.set(Window::DimX, Window::Dimension(0, 1, 1));
344
345 const int window_step_x = 16;
346 const auto window_start_x = static_cast<int>(window.x().start());
347 const auto window_end_x = static_cast<int>(window.x().end());
Georgios Pinitasd7341fb2020-11-12 15:05:01 +0000348 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000349
350 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
351 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
352 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
353
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000354 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000355 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
356
357 if(is_broadcast_across_x)
358 {
359 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
360 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
361 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
362 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
363 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
364 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
365 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
366
Michalis Spyrou2232a202020-07-13 15:15:33 +0100367 const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
368 const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
369 const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
370 const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
371
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000372 // Clear X Dimension on execution window as we handle manually
373 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
374
375 Iterator broadcast_input(broadcast_tensor, broadcast_win);
376 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
377 Iterator output(out, win);
378
379 execute_window_loop(win, [&](const Coordinates &)
380 {
381 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
382 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
383
384 const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
385 const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
386
387 const float32x4x4_t bf =
388 {
389 {
390 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
391 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
392 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
393 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
394 }
395 };
396 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
397
398 // Compute S elements per iteration
399 int x = window_start_x;
400 for(; x <= (window_end_x - window_step_x); x += window_step_x)
401 {
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000402 const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000403 const float32x4x4_t af =
404 {
405 {
406 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
407 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
408 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
409 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
410 }
411 };
412
413 const int32x4x4_t rf =
414 {
415 {
416#ifdef __aarch64__
417 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
418 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
419 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
420 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
421#else //__aarch64__
422 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
423 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
424 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
425 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
426#endif //__aarch64__
427 }
428 };
429
430 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
431 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
432 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
433 }
434
435 // Compute left-over elements
436 for(; x < window_end_x; ++x)
437 {
438 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
439 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
440 }
441 },
442 broadcast_input, non_broadcast_input, output);
443 }
444 else
445 {
446 // Clear X Dimension on execution window as we handle manually
447 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
448 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
449
450 Iterator input1(in1, input1_win);
451 Iterator input2(in2, input2_win);
452 Iterator output(out, win);
453
Michalis Spyrou2232a202020-07-13 15:15:33 +0100454 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
455 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
456 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
457 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000458 execute_window_loop(win, [&](const Coordinates &)
459 {
460 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
461 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
462 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
463
464 // Compute S elements per iteration
465 int x = window_start_x;
466 for(; x <= (window_end_x - window_step_x); x += window_step_x)
467 {
468 const int8x16_t a = vld1q_s8(input1_ptr + x);
469 const int8x16_t b = vld1q_s8(input2_ptr + x);
470
471 const float32x4x4_t af =
472 {
473 {
474 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
475 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
476 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
477 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
478 }
479 };
480
481 const float32x4x4_t bf =
482 {
483 {
484 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
485 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
486 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
487 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
488 }
489 };
490
491 const int32x4x4_t rf =
492 {
493 {
494#ifdef __aarch64__
495 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
496 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
497 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
498 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
499#else //__aarch64__
500 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
501 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
502 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
503 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
504#endif //__aarch64__
505 }
506 };
507
508 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
509 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
510 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
511 }
512
513 // Compute left-over elements
514 for(; x < window_end_x; ++x)
515 {
516 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
517 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
518 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), out->info()->quantization_info());
519 }
520 },
521 input1, input2, output);
522 }
523}
524
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100525void add_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
526{
527 ARM_COMPUTE_UNUSED(policy);
528
529 // Create input windows
530 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
531 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
532
533 // Clear X Dimension on execution window as we handle manually
534 Window win = window;
535 win.set(Window::DimX, Window::Dimension(0, 1, 1));
536
537 const int window_step_x = 8;
538 const auto window_start_x = static_cast<int>(window.x().start());
539 const auto window_end_x = static_cast<int>(window.x().end());
Georgios Pinitasd7341fb2020-11-12 15:05:01 +0000540 const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100541
542 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
543 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
544 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
545
546 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
547 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
548 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
549
550 if(is_broadcast_across_x)
551 {
552 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
553 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
554 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
555 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
556 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
557 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
558 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
559
560 // Clear X Dimension on execution window as we handle manually
561 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
562
563 Iterator broadcast_input(broadcast_tensor, broadcast_win);
564 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
565 Iterator output(out, win);
566
567 execute_window_loop(win, [&](const Coordinates &)
568 {
569 const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
570 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
571
572 const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
573 const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
574
575 const float32x4x2_t bf =
576 {
577 {
578 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
579 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
580 }
581 };
582 const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
583
584 // Compute S elements per iteration
585 int x = window_start_x;
586 for(; x <= (window_end_x - window_step_x); x += window_step_x)
587 {
588 const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
589 const float32x4x2_t af =
590 {
591 {
592 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
593 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
594 }
595 };
596
597 const int32x4x4_t rf =
598 {
599 {
600#ifdef __aarch64__
601 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
602 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
603#else //__aarch64__
604 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
605 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
606#endif //__aarch64__
607 }
608 };
609
610 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
611 vst1q_s16(output_ptr + x, pa);
612 }
613
614 // Compute left-over elements
615 for(; x < window_end_x; ++x)
616 {
617 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
618 *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
619 }
620 },
621 broadcast_input, non_broadcast_input, output);
622 }
623 else
624 {
625 // Clear X Dimension on execution window as we handle manually
626 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
627 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
628
629 Iterator input1(in1, input1_win);
630 Iterator input2(in2, input2_win);
631 Iterator output(out, win);
632
633 execute_window_loop(win, [&](const Coordinates &)
634 {
635 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
636 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
637 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
638
639 // Compute S elements per iteration
640 int x = window_start_x;
641 for(; x <= (window_end_x - window_step_x); x += window_step_x)
642 {
643 const int16x8_t a = vld1q_s16(input1_ptr + x);
644 const int16x8_t b = vld1q_s16(input2_ptr + x);
645
646 const float32x4x2_t af =
647 {
648 {
649 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
650 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
651 }
652 };
653
654 const float32x4x2_t bf =
655 {
656 {
657 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
658 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
659 }
660 };
661
662 const int32x4x2_t rf =
663 {
664 {
665#ifdef __aarch64__
666 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
667 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
668#else //__aarch64__
669 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
670 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
671#endif //__aarch64__
672 }
673 };
674
675 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
676 vst1q_s16(output_ptr + x, pa);
677 }
678
679 // Compute left-over elements
680 for(; x < window_end_x; ++x)
681 {
682 const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
683 const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
684 *(output_ptr + x) = quantize_qsymm16((afs + bfs), out->info()->quantization_info());
685 }
686 },
687 input1, input2, output);
688 }
689}
690
Georgios Pinitas5a594532018-12-03 14:30:05 +0000691void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
692{
693 // Create input windows
694 Window win = window;
695 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
696 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
697
698 // Clear X Dimension on execution window as we handle manually
699 win.set(Window::DimX, Window::Dimension(0, 1, 1));
700 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
701 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
702
703 Iterator input1(in1, input1_win);
704 Iterator input2(in2, input2_win);
705 Iterator output(out, win);
706
707 const int window_step_x = 8;
708 const auto window_start_x = static_cast<int>(window.x().start());
709 const auto window_end_x = static_cast<int>(window.x().end());
710
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100711 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitas5a594532018-12-03 14:30:05 +0000712 {
713 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
714 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
715 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
716
717 if(policy == ConvertPolicy::WRAP)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000718 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000719 // Compute S elements per iteration
720 int x = window_start_x;
721 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000722 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000723 const auto vin1 = wrapper::vloadq(input1_ptr + x);
724 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
725 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000726 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000727
Georgios Pinitas5a594532018-12-03 14:30:05 +0000728 // Compute left-over elements
729 for(; x < window_end_x; ++x)
730 {
731 *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
732 }
733 }
734 else
735 {
736 // Compute S elements per iteration
737 int x = window_start_x;
738 for(; x <= (window_end_x - window_step_x); x += window_step_x)
739 {
740 const auto vin1 = wrapper::vloadq(input1_ptr + x);
741 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
742 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
743 }
744
745 // Compute left-over elements
746 for(; x < window_end_x; ++x)
747 {
748 *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
749 }
750 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000751 },
752 input1, input2, output);
753}
754
/** Element-wise addition of a U8 tensor and an S16 tensor into an S16 output.
 *
 * Addition is commutative, so this forwards to add_S16_U8_S16 with the
 * operands swapped; no extra code path is needed.
 */
inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
{
    // Simply swap the two input buffers:
    add_S16_U8_S16(input2, input1, output, policy, window);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100760
Georgios Pinitas5a594532018-12-03 14:30:05 +0000761void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
762{
763 // Create input windows
764 Window win = window;
765 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
766 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
767
768 // Clear X Dimension on execution window as we handle manually
769 win.set(Window::DimX, Window::Dimension(0, 1, 1));
770 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
771 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
772
773 Iterator input1(in1, input1_win);
774 Iterator input2(in2, input2_win);
775 Iterator output(out, win);
776
777 const int window_step_x = 8;
778 const auto window_start_x = static_cast<int>(window.x().start());
779 const auto window_end_x = static_cast<int>(window.x().end());
780
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100781 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100782 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000783 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
784 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
785 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100786
Georgios Pinitas5a594532018-12-03 14:30:05 +0000787 if(policy == ConvertPolicy::WRAP)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100788 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000789 // Compute S elements per iteration
790 int x = window_start_x;
791 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100792 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000793 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
794 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
795 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100796 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100797
Georgios Pinitas5a594532018-12-03 14:30:05 +0000798 // Compute left-over elements
799 for(; x < window_end_x; ++x)
800 {
801 *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
802 }
803 }
804 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100805 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000806 // Compute S elements per iteration
807 int x = window_start_x;
808 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100809 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000810 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
811 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
812 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100813 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100814
Georgios Pinitas5a594532018-12-03 14:30:05 +0000815 // Compute left-over elements
816 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100817 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000818 *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
819 static_cast<int16_t>(*(input2_ptr + x)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100820 }
Georgios Pinitas5a594532018-12-03 14:30:05 +0000821 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100822 },
823 input1, input2, output);
824}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000825
/** Validate the data types and shapes of an addition configuration.
 *
 * Checks that both inputs use a supported data type, that the shapes are
 * broadcast compatible, and — when the output is already configured — that
 * the (input1, input2, output) type triple is one of the supported
 * combinations and that the output shape matches the broadcast shape.
 *
 * @param[in] input1 First input tensor info.
 * @param[in] input2 Second input tensor info.
 * @param[in] output Output tensor info (may be uninitialized; then only input checks run).
 * @param[in] policy Unused here; overflow policy is validated elsewhere.
 *
 * @return Status{} on success, an error status otherwise.
 */
Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);

    // F16 is only usable when the CPU/build supports FP16 vector arithmetic.
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);

    // An empty broadcast shape means the two shapes are incompatible.
    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
    // Mixed-type kernels (e.g. S16+U8) have no broadcast specialization, so
    // broadcasting in X requires all three tensors to share one data type.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
                                                                                                || (input2.data_type() != output.data_type())),
                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        // The triple must match one of the kernels registered in configure().
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16),
            "You called addition with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }

    return Status{};
}
868
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100869std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000870{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000871 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000872 const TensorShape &out_shape = broadcast_pair.first;
873 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000874
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100875 // Auto initialize output if not initialized
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100876 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000877 set_shape_if_empty(output, out_shape);
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100878
Michele Di Giorgio19023832020-06-17 16:08:10 +0000879 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000880 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000881 set_format_if_unknown(output, Format::S16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000882 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000883 if(input1.data_type() == DataType::S32 || input2.data_type() == DataType::S32)
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100884 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000885 set_format_if_unknown(output, Format::S32);
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100886 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000887 else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000888 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000889 set_format_if_unknown(output, Format::F16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000890 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000891 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000892 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000893 set_format_if_unknown(output, Format::F32);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000894 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000895 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000896 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000897 set_data_type_if_unknown(output, DataType::QASYMM8);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000898 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000899 else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000900 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000901 set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000902 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000903 else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100904 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000905 set_data_type_if_unknown(output, DataType::QSYMM16);
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100906 }
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000907 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000908
Georgios Pinitas5a594532018-12-03 14:30:05 +0000909 Window win = calculate_max_window(valid_region, Steps());
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000910
Georgios Pinitas5a594532018-12-03 14:30:05 +0000911 // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
912 Coordinates coord;
Michele Di Giorgio19023832020-06-17 16:08:10 +0000913 coord.set_num_dimensions(output.num_dimensions());
914 output.set_valid_region(valid_region);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000915 return std::make_pair(Status{}, win);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000916}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100917} // namespace
918
// Default constructor: no kernel function selected yet; configure() picks
// the implementation and policy before the kernel can be run.
NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
    : _func(nullptr), _policy()
{
}
923
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100924void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100925{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000926 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100927 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100928
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000929 // Configure kernel window
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100930 auto win_config = validate_and_configure_window(*input1, *input2, *output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000931 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100932
933 static std::map<std::string, AddFunction *> map_function =
934 {
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000935 { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
936 { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000937 { "add_wrap_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
938 { "add_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100939 { "add_wrap_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
940 { "add_saturate_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100941 { "add_wrap_U8_U8_U8", &add_same<uint8_t> },
942 { "add_saturate_U8_U8_U8", &add_same<uint8_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000943 { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
944 { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
945 { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
946 { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
947 { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
948 { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100949 { "add_wrap_S16_S16_S16", &add_same<int16_t> },
950 { "add_saturate_S16_S16_S16", &add_same<int16_t> },
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100951 { "add_wrap_S32_S32_S32", &add_same<int32_t> },
952 { "add_saturate_S32_S32_S32", &add_same<int32_t> },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100953 { "add_wrap_F32_F32_F32", &add_same<float> },
954 { "add_saturate_F32_F32_F32", &add_same<float> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000955#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100956 { "add_wrap_F16_F16_F16", &add_same<float16_t> },
957 { "add_saturate_F16_F16_F16", &add_same<float16_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000958#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100959 };
960
Georgios Pinitas5a594532018-12-03 14:30:05 +0000961 _policy = policy;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100962
963 std::string function_to_call("add_");
964 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100965 function_to_call += string_from_data_type(input1->data_type()) + "_";
966 function_to_call += string_from_data_type(input2->data_type()) + "_";
967 function_to_call += string_from_data_type(output->data_type());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100968
969 auto it = map_function.find(function_to_call);
970
971 if(it != map_function.end())
972 {
973 _func = it->second;
974 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100975
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000976 INEKernel::configure(win_config.second);
977}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100978
/** Static validation: check whether a given configuration is supported.
 *
 * Runs the same argument checks as configure(), plus window validation on
 * cloned tensor infos so the caller's infos are never modified.
 *
 * @param[in] input1 First input tensor info.
 * @param[in] input2 Second input tensor info.
 * @param[in] output Output tensor info.
 * @param[in] policy Overflow policy.
 *
 * @return Status{} if the configuration is supported.
 */
Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);

    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
    // Clones keep validate() side-effect free: the window pass may auto-init the output.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);

    return Status{};
}
988
Georgios Pinitas0499dff2020-07-31 22:21:38 +0100989void NEArithmeticAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100990{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100991 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100992 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
993 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100994 // Dispatch kernel
Georgios Pinitas0499dff2020-07-31 22:21:38 +0100995 (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0),
996 tensors.get_const_tensor(TensorType::ACL_SRC_1),
997 tensors.get_tensor(TensorType::ACL_DST),
998 _policy,
999 window);
Diego Lopez Recas0021d752017-12-18 14:42:56 +00001000}
Michalis Spyrou7a7fe652020-05-15 11:28:59 +01001001} // namespace arm_compute