blob: 59a454f91ae3ec0521a64d4f3f7f893036a8557d [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/ITensor.h"
Georgios Pinitas5a594532018-12-03 14:30:05 +000030#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031#include "arm_compute/core/Validate.h"
32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include <map>
34#include <string>
35
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036namespace arm_compute
37{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038namespace
39{
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010040template <typename T>
41void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, const ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042{
Georgios Pinitas5a594532018-12-03 14:30:05 +000043 /** NEON vector tag type. */
44 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
45
46 // Create input windows
47 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
48 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
49
50 // Clear X Dimension on execution window as we handle manually
51 Window win = window;
52 win.set(Window::DimX, Window::Dimension(0, 1, 1));
53
54 constexpr int window_step_x = 16 / sizeof(T);
55 const auto window_start_x = static_cast<int>(window.x().start());
56 const auto window_end_x = static_cast<int>(window.x().end());
57 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
58
59 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010060 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000061 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
65 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010066
Georgios Pinitas5a594532018-12-03 14:30:05 +000067 // Clear X Dimension on execution window as we handle manually
68 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010069
Georgios Pinitas5a594532018-12-03 14:30:05 +000070 Iterator broadcast_input(broadcast_tensor, broadcast_win);
71 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
72 Iterator output(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073
Michalis Spyroua4f378d2019-04-26 14:54:54 +010074 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000076 const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
77 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +010078
Georgios Pinitas5a594532018-12-03 14:30:05 +000079 const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
80 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
Anthony Barbier6ff3b192017-09-04 18:44:23 +010081
Georgios Pinitas5a594532018-12-03 14:30:05 +000082 // Compute S elements per iteration
83 int x = window_start_x;
84 for(; x <= (window_end_x - window_step_x); x += window_step_x)
85 {
86 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010087 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
Georgios Pinitas5a594532018-12-03 14:30:05 +000088 wrapper::vstore(output_ptr + x, res);
89 }
90
91 // Compute left-over elements
92 for(; x < window_end_x; ++x)
93 {
94 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +010095 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
Georgios Pinitas5a594532018-12-03 14:30:05 +000096 }
97 },
98 broadcast_input, non_broadcast_input, output);
99 }
100 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100101 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000102 // Clear X Dimension on execution window as we handle manually
103 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
104 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
105
106 Iterator input1(in1, input1_win);
107 Iterator input2(in2, input2_win);
108 Iterator output(out, win);
109
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100110 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100111 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000112 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
113 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
114 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100115
Georgios Pinitas5a594532018-12-03 14:30:05 +0000116 // Compute S elements per iteration
117 int x = window_start_x;
118 for(; x <= (window_end_x - window_step_x); x += window_step_x)
119 {
120 const auto val1 = wrapper::vloadq(input1_ptr + x);
121 const auto val2 = wrapper::vloadq(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100122 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000123 wrapper::vstore(output_ptr + x, res);
124 }
125
126 // Compute left-over elements
127 for(; x < window_end_x; ++x)
128 {
129 const auto val1 = *(input1_ptr + x);
130 const auto val2 = *(input2_ptr + x);
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100131 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
Georgios Pinitas5a594532018-12-03 14:30:05 +0000132 }
133 },
134 input1, input2, output);
135 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100136}
137
Georgios Pinitas5a594532018-12-03 14:30:05 +0000138void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100139{
Georgios Pinitas5a594532018-12-03 14:30:05 +0000140 ARM_COMPUTE_UNUSED(policy);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100141
Georgios Pinitas5a594532018-12-03 14:30:05 +0000142 // Create input windows
143 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
144 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100145
Georgios Pinitas5a594532018-12-03 14:30:05 +0000146 // Clear X Dimension on execution window as we handle manually
147 Window win = window;
148 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100149
Georgios Pinitas5a594532018-12-03 14:30:05 +0000150 const int window_step_x = 16;
151 const auto window_start_x = static_cast<int>(window.x().start());
152 const auto window_end_x = static_cast<int>(window.x().end());
153 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100154
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100155 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
156 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
157 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000158
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100159 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
160 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
161 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
162 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
163 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
164 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000165
Georgios Pinitas5a594532018-12-03 14:30:05 +0000166 if(is_broadcast_across_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000167 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100168 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
169 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
170 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
171 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
172 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
173 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
174 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000175
Georgios Pinitas5a594532018-12-03 14:30:05 +0000176 // Clear X Dimension on execution window as we handle manually
177 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
178
179 Iterator broadcast_input(broadcast_tensor, broadcast_win);
180 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
181 Iterator output(out, win);
182
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100183 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000184 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000185 const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
186 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000187
Georgios Pinitas5a594532018-12-03 14:30:05 +0000188 const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
189 const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
190
191 const float32x4x4_t bf =
192 {
193 {
194 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
195 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
196 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
197 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
198 }
199 };
200 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
201
202 // Compute S elements per iteration
203 int x = window_start_x;
204 for(; x <= (window_end_x - window_step_x); x += window_step_x)
205 {
206 const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
207 const float32x4x4_t af =
208 {
209 {
210 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
211 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
212 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
213 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
214 }
215 };
216
217 const int32x4x4_t rf =
218 {
219 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000220#ifdef __aarch64__
221 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
222 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
223 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
224 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100225#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000226 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
227 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
228 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
229 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000230#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000231 }
232 };
233
234 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
235 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
236 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
237 }
238
239 // Compute left-over elements
240 for(; x < window_end_x; ++x)
241 {
242 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100243 *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000244 }
245 },
246 broadcast_input, non_broadcast_input, output);
247 }
248 else
249 {
250 // Clear X Dimension on execution window as we handle manually
251 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
252 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
253
Georgios Pinitas5a594532018-12-03 14:30:05 +0000254 Iterator input1(in1, input1_win);
255 Iterator input2(in2, input2_win);
256 Iterator output(out, win);
257
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100258 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000259 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000260 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
261 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
262 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000263
Georgios Pinitas5a594532018-12-03 14:30:05 +0000264 // Compute S elements per iteration
265 int x = window_start_x;
266 for(; x <= (window_end_x - window_step_x); x += window_step_x)
267 {
268 const uint8x16_t a = vld1q_u8(input1_ptr + x);
269 const uint8x16_t b = vld1q_u8(input2_ptr + x);
270
271 const float32x4x4_t af =
272 {
273 {
274 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
275 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
276 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
277 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
278 }
279 };
280
281 const float32x4x4_t bf =
282 {
283 {
284 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
285 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
286 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
287 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
288 }
289 };
290
291 const int32x4x4_t rf =
292 {
293 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000294#ifdef __aarch64__
295 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
296 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
297 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
298 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100299#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000300 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
301 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
302 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
303 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000304#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000305 }
306 };
307
308 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
309 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
310 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
311 }
312
313 // Compute left-over elements
314 for(; x < window_end_x; ++x)
315 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100316 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
317 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
318 *(output_ptr + x) = quantize_qasymm8((afs + bfs), out->info()->quantization_info());
Georgios Pinitas5a594532018-12-03 14:30:05 +0000319 }
320 },
321 input1, input2, output);
322 }
323}
324
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000325void add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
326{
327 ARM_COMPUTE_UNUSED(policy);
328
329 // Create input windows
330 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
331 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
332
333 // Clear X Dimension on execution window as we handle manually
334 Window win = window;
335 win.set(Window::DimX, Window::Dimension(0, 1, 1));
336
337 const int window_step_x = 16;
338 const auto window_start_x = static_cast<int>(window.x().start());
339 const auto window_end_x = static_cast<int>(window.x().end());
340 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
341
342 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
343 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
344 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
345
346 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
347 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
348 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
349 const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
350 const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
351 const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
352
353 if(is_broadcast_across_x)
354 {
355 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
356 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
357 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
358 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
359 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
360 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
361 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
362
363 // Clear X Dimension on execution window as we handle manually
364 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
365
366 Iterator broadcast_input(broadcast_tensor, broadcast_win);
367 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
368 Iterator output(out, win);
369
370 execute_window_loop(win, [&](const Coordinates &)
371 {
372 const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
373 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
374
375 const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
376 const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
377
378 const float32x4x4_t bf =
379 {
380 {
381 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
382 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2),
383 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
384 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2),
385 }
386 };
387 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
388
389 // Compute S elements per iteration
390 int x = window_start_x;
391 for(; x <= (window_end_x - window_step_x); x += window_step_x)
392 {
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000393 const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000394 const float32x4x4_t af =
395 {
396 {
397 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
398 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
399 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
400 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
401 }
402 };
403
404 const int32x4x4_t rf =
405 {
406 {
407#ifdef __aarch64__
408 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
409 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
410 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
411 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
412#else //__aarch64__
413 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
414 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
415 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
416 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
417#endif //__aarch64__
418 }
419 };
420
421 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
422 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
423 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
424 }
425
426 // Compute left-over elements
427 for(; x < window_end_x; ++x)
428 {
429 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
430 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
431 }
432 },
433 broadcast_input, non_broadcast_input, output);
434 }
435 else
436 {
437 // Clear X Dimension on execution window as we handle manually
438 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
439 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
440
441 Iterator input1(in1, input1_win);
442 Iterator input2(in2, input2_win);
443 Iterator output(out, win);
444
445 execute_window_loop(win, [&](const Coordinates &)
446 {
447 const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
448 const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
449 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
450
451 // Compute S elements per iteration
452 int x = window_start_x;
453 for(; x <= (window_end_x - window_step_x); x += window_step_x)
454 {
455 const int8x16_t a = vld1q_s8(input1_ptr + x);
456 const int8x16_t b = vld1q_s8(input2_ptr + x);
457
458 const float32x4x4_t af =
459 {
460 {
461 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
462 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1),
463 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
464 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1),
465 }
466 };
467
468 const float32x4x4_t bf =
469 {
470 {
471 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
472 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2),
473 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
474 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2),
475 }
476 };
477
478 const int32x4x4_t rf =
479 {
480 {
481#ifdef __aarch64__
482 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
483 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
484 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
485 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
486#else //__aarch64__
487 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
488 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
489 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
490 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
491#endif //__aarch64__
492 }
493 };
494
495 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
496 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
497 vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
498 }
499
500 // Compute left-over elements
501 for(; x < window_end_x; ++x)
502 {
503 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
504 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
505 *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), out->info()->quantization_info());
506 }
507 },
508 input1, input2, output);
509 }
510}
511
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100512void add_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
513{
514 ARM_COMPUTE_UNUSED(policy);
515
516 // Create input windows
517 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
518 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
519
520 // Clear X Dimension on execution window as we handle manually
521 Window win = window;
522 win.set(Window::DimX, Window::Dimension(0, 1, 1));
523
524 const int window_step_x = 8;
525 const auto window_start_x = static_cast<int>(window.x().start());
526 const auto window_end_x = static_cast<int>(window.x().end());
527 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
528
529 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
530 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
531 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
532
533 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
534 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
535 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
536
537 if(is_broadcast_across_x)
538 {
539 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
540 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
541 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
542 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
543 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
544 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
545 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
546
547 // Clear X Dimension on execution window as we handle manually
548 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
549
550 Iterator broadcast_input(broadcast_tensor, broadcast_win);
551 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
552 Iterator output(out, win);
553
554 execute_window_loop(win, [&](const Coordinates &)
555 {
556 const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
557 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
558
559 const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
560 const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
561
562 const float32x4x2_t bf =
563 {
564 {
565 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
566 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
567 }
568 };
569 const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
570
571 // Compute S elements per iteration
572 int x = window_start_x;
573 for(; x <= (window_end_x - window_step_x); x += window_step_x)
574 {
575 const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
576 const float32x4x2_t af =
577 {
578 {
579 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
580 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
581 }
582 };
583
584 const int32x4x4_t rf =
585 {
586 {
587#ifdef __aarch64__
588 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
589 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
590#else //__aarch64__
591 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
592 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
593#endif //__aarch64__
594 }
595 };
596
597 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
598 vst1q_s16(output_ptr + x, pa);
599 }
600
601 // Compute left-over elements
602 for(; x < window_end_x; ++x)
603 {
604 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
605 *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
606 }
607 },
608 broadcast_input, non_broadcast_input, output);
609 }
610 else
611 {
612 // Clear X Dimension on execution window as we handle manually
613 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
614 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
615
616 Iterator input1(in1, input1_win);
617 Iterator input2(in2, input2_win);
618 Iterator output(out, win);
619
620 execute_window_loop(win, [&](const Coordinates &)
621 {
622 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
623 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
624 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
625
626 // Compute S elements per iteration
627 int x = window_start_x;
628 for(; x <= (window_end_x - window_step_x); x += window_step_x)
629 {
630 const int16x8_t a = vld1q_s16(input1_ptr + x);
631 const int16x8_t b = vld1q_s16(input2_ptr + x);
632
633 const float32x4x2_t af =
634 {
635 {
636 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
637 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
638 }
639 };
640
641 const float32x4x2_t bf =
642 {
643 {
644 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
645 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
646 }
647 };
648
649 const int32x4x2_t rf =
650 {
651 {
652#ifdef __aarch64__
653 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
654 vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
655#else //__aarch64__
656 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
657 vcvtq_s32_f32(vmulq_f32(vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
658#endif //__aarch64__
659 }
660 };
661
662 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
663 vst1q_s16(output_ptr + x, pa);
664 }
665
666 // Compute left-over elements
667 for(; x < window_end_x; ++x)
668 {
669 const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
670 const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
671 *(output_ptr + x) = quantize_qsymm16((afs + bfs), out->info()->quantization_info());
672 }
673 },
674 input1, input2, output);
675 }
676}
677
Georgios Pinitas5a594532018-12-03 14:30:05 +0000678void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
679{
680 // Create input windows
681 Window win = window;
682 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
683 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
684
685 // Clear X Dimension on execution window as we handle manually
686 win.set(Window::DimX, Window::Dimension(0, 1, 1));
687 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
688 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
689
690 Iterator input1(in1, input1_win);
691 Iterator input2(in2, input2_win);
692 Iterator output(out, win);
693
694 const int window_step_x = 8;
695 const auto window_start_x = static_cast<int>(window.x().start());
696 const auto window_end_x = static_cast<int>(window.x().end());
697
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100698 execute_window_loop(win, [&](const Coordinates &)
Georgios Pinitas5a594532018-12-03 14:30:05 +0000699 {
700 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
701 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
702 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
703
704 if(policy == ConvertPolicy::WRAP)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000705 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000706 // Compute S elements per iteration
707 int x = window_start_x;
708 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000709 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000710 const auto vin1 = wrapper::vloadq(input1_ptr + x);
711 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
712 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000713 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000714
Georgios Pinitas5a594532018-12-03 14:30:05 +0000715 // Compute left-over elements
716 for(; x < window_end_x; ++x)
717 {
718 *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
719 }
720 }
721 else
722 {
723 // Compute S elements per iteration
724 int x = window_start_x;
725 for(; x <= (window_end_x - window_step_x); x += window_step_x)
726 {
727 const auto vin1 = wrapper::vloadq(input1_ptr + x);
728 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
729 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
730 }
731
732 // Compute left-over elements
733 for(; x < window_end_x; ++x)
734 {
735 *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
736 }
737 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000738 },
739 input1, input2, output);
740}
741
// Element-wise addition of a U8 tensor and an S16 tensor into an S16 output.
// Addition is commutative, so this forwards to the S16 + U8 specialisation
// with the operands swapped instead of duplicating the kernel.
inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
{
    // Simply swap the two input buffers:
    add_S16_U8_S16(input2, input1, output, policy, window);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100747
Georgios Pinitas5a594532018-12-03 14:30:05 +0000748void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
749{
750 // Create input windows
751 Window win = window;
752 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
753 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
754
755 // Clear X Dimension on execution window as we handle manually
756 win.set(Window::DimX, Window::Dimension(0, 1, 1));
757 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
758 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
759
760 Iterator input1(in1, input1_win);
761 Iterator input2(in2, input2_win);
762 Iterator output(out, win);
763
764 const int window_step_x = 8;
765 const auto window_start_x = static_cast<int>(window.x().start());
766 const auto window_end_x = static_cast<int>(window.x().end());
767
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100768 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100769 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000770 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
771 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
772 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100773
Georgios Pinitas5a594532018-12-03 14:30:05 +0000774 if(policy == ConvertPolicy::WRAP)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100775 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000776 // Compute S elements per iteration
777 int x = window_start_x;
778 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100779 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000780 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
781 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
782 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100783 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100784
Georgios Pinitas5a594532018-12-03 14:30:05 +0000785 // Compute left-over elements
786 for(; x < window_end_x; ++x)
787 {
788 *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
789 }
790 }
791 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100792 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000793 // Compute S elements per iteration
794 int x = window_start_x;
795 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100796 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000797 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
798 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
799 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100800 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100801
Georgios Pinitas5a594532018-12-03 14:30:05 +0000802 // Compute left-over elements
803 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100804 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000805 *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
806 static_cast<int16_t>(*(input2_ptr + x)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100807 }
Georgios Pinitas5a594532018-12-03 14:30:05 +0000808 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100809 },
810 input1, input2, output);
811}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000812
// Validate data types and shapes of the two inputs and the (possibly not yet
// configured) output. Returns an error Status on the first failing check.
Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    // The convert policy does not constrain type validity, only runtime behaviour
    ARM_COMPUTE_UNUSED(policy);

    // Reject F16 when the build/CPU has no half-precision support
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    // Both inputs must be one of the supported element types
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
                                                         DataType::S32, DataType::F32);

    // An empty broadcast shape means the two input shapes cannot be broadcast together
    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
    // Mixed-type kernels have no broadcasting support along X
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
                                                                                                || (input2.data_type() != output.data_type())),
                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");

    // Validate in case of configured output: the (in1, in2, out) type triple must
    // match one of the specialised kernels registered in configure()
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16),
            "You called addition with the wrong image formats");

        // The configured output shape must equal the broadcast of the input shapes
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }

    return Status{};
}
855
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100856std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000857{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000858 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000859 const TensorShape &out_shape = broadcast_pair.first;
860 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000861
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100862 // Auto initialize output if not initialized
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100863 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000864 set_shape_if_empty(output, out_shape);
Michele Di Giorgio4a616532020-06-04 15:05:38 +0100865
Michele Di Giorgio19023832020-06-17 16:08:10 +0000866 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000867 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000868 set_format_if_unknown(output, Format::S16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000869 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000870 if(input1.data_type() == DataType::S32 || input2.data_type() == DataType::S32)
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100871 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000872 set_format_if_unknown(output, Format::S32);
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100873 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000874 else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000875 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000876 set_format_if_unknown(output, Format::F16);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000877 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000878 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000879 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000880 set_format_if_unknown(output, Format::F32);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000881 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000882 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000883 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000884 set_data_type_if_unknown(output, DataType::QASYMM8);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000885 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000886 else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000887 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000888 set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000889 }
Michele Di Giorgio19023832020-06-17 16:08:10 +0000890 else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100891 {
Michele Di Giorgio19023832020-06-17 16:08:10 +0000892 set_data_type_if_unknown(output, DataType::QSYMM16);
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100893 }
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000894 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000895
Georgios Pinitas5a594532018-12-03 14:30:05 +0000896 Window win = calculate_max_window(valid_region, Steps());
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000897
Georgios Pinitas5a594532018-12-03 14:30:05 +0000898 // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
899 Coordinates coord;
Michele Di Giorgio19023832020-06-17 16:08:10 +0000900 coord.set_num_dimensions(output.num_dimensions());
901 output.set_valid_region(valid_region);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000902 return std::make_pair(Status{}, win);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000903}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100904} // namespace
905
// Default constructor: no kernel specialisation selected yet; configure()
// fills in _func and _policy before the kernel can run.
NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
    : _func(nullptr), _policy()
{
}
910
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100911void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100912{
Michele Di Giorgio19023832020-06-17 16:08:10 +0000913 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100914 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100915
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000916 // Configure kernel window
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100917 auto win_config = validate_and_configure_window(*input1, *input2, *output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000918 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100919
920 static std::map<std::string, AddFunction *> map_function =
921 {
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000922 { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
923 { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Michalis Spyroubc4d7c22019-12-03 15:11:09 +0000924 { "add_wrap_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
925 { "add_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Manuel Bottini3689fcd2019-06-14 17:18:12 +0100926 { "add_wrap_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
927 { "add_saturate_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100928 { "add_wrap_U8_U8_U8", &add_same<uint8_t> },
929 { "add_saturate_U8_U8_U8", &add_same<uint8_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000930 { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
931 { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
932 { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
933 { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
934 { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
935 { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100936 { "add_wrap_S16_S16_S16", &add_same<int16_t> },
937 { "add_saturate_S16_S16_S16", &add_same<int16_t> },
Michele Di Giorgio11c562c2020-06-10 16:34:50 +0100938 { "add_wrap_S32_S32_S32", &add_same<int32_t> },
939 { "add_saturate_S32_S32_S32", &add_same<int32_t> },
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100940 { "add_wrap_F32_F32_F32", &add_same<float> },
941 { "add_saturate_F32_F32_F32", &add_same<float> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000942#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100943 { "add_wrap_F16_F16_F16", &add_same<float16_t> },
944 { "add_saturate_F16_F16_F16", &add_same<float16_t> },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000945#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100946 };
947
Georgios Pinitas5a594532018-12-03 14:30:05 +0000948 _policy = policy;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100949
950 std::string function_to_call("add_");
951 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100952 function_to_call += string_from_data_type(input1->data_type()) + "_";
953 function_to_call += string_from_data_type(input2->data_type()) + "_";
954 function_to_call += string_from_data_type(output->data_type());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100955
956 auto it = map_function.find(function_to_call);
957
958 if(it != map_function.end())
959 {
960 _func = it->second;
961 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100962
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000963 INEKernel::configure(win_config.second);
964}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100965
// Static validation entry point: checks that a kernel can be configured for
// the given tensor infos and policy without modifying the caller's infos.
Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);

    // Dry-run the window configuration on clones so the inputs stay untouched
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);

    return Status{};
}
975
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100976void NEArithmeticAdditionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100977{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100978 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100979 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
980 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Michalis Spyrou173ba9b2020-06-23 17:25:43 +0100981 // Dispatch kernel
982 (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), _policy, window);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000983}
Michalis Spyrou7a7fe652020-05-15 11:28:59 +0100984} // namespace arm_compute