blob: ffa578f40ebf1c865fa286528ff55bbb19627a21 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Georgios Pinitas5a594532018-12-03 14:30:05 +00002 * Copyright (c) 2016-2019 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/IAccessWindow.h"
30#include "arm_compute/core/ITensor.h"
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010031#include "arm_compute/core/NEON/NEFixedPoint.h"
Georgios Pinitas5a594532018-12-03 14:30:05 +000032#include "arm_compute/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include "arm_compute/core/TensorInfo.h"
34#include "arm_compute/core/Validate.h"
35
36#include <algorithm>
37#include <arm_neon.h>
38#include <cstdint>
39#include <map>
40#include <string>
41
42using namespace arm_compute;
43
44namespace arm_compute
45{
46class Coordinates;
47} // namespace arm_compute
48
49namespace
50{
Georgios Pinitas5a594532018-12-03 14:30:05 +000051template <typename T, bool is_sat>
52void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010053{
Georgios Pinitas5a594532018-12-03 14:30:05 +000054 ARM_COMPUTE_UNUSED(policy);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055
Georgios Pinitas5a594532018-12-03 14:30:05 +000056 /** NEON vector tag type. */
57 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
58
59 // Create input windows
60 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
61 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
62
63 // Clear X Dimension on execution window as we handle manually
64 Window win = window;
65 win.set(Window::DimX, Window::Dimension(0, 1, 1));
66
67 constexpr int window_step_x = 16 / sizeof(T);
68 const auto window_start_x = static_cast<int>(window.x().start());
69 const auto window_end_x = static_cast<int>(window.x().end());
70 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
71
72 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000074 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
75 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
76 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
77 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
78 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010079
Georgios Pinitas5a594532018-12-03 14:30:05 +000080 // Clear X Dimension on execution window as we handle manually
81 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010082
Georgios Pinitas5a594532018-12-03 14:30:05 +000083 Iterator broadcast_input(broadcast_tensor, broadcast_win);
84 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
85 Iterator output(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010086
Georgios Pinitas5a594532018-12-03 14:30:05 +000087 execute_window_loop(win, [&](const Coordinates & id)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010088 {
Georgios Pinitas5a594532018-12-03 14:30:05 +000089 const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
90 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +010091
Georgios Pinitas5a594532018-12-03 14:30:05 +000092 const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
93 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
Anthony Barbier6ff3b192017-09-04 18:44:23 +010094
Georgios Pinitas5a594532018-12-03 14:30:05 +000095 // Compute S elements per iteration
96 int x = window_start_x;
97 for(; x <= (window_end_x - window_step_x); x += window_step_x)
98 {
99 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
100 const auto res = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
101 wrapper::vstore(output_ptr + x, res);
102 }
103
104 // Compute left-over elements
105 for(; x < window_end_x; ++x)
106 {
107 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
108 *(output_ptr + x) = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
109 }
110 },
111 broadcast_input, non_broadcast_input, output);
112 }
113 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100114 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000115 // Clear X Dimension on execution window as we handle manually
116 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
117 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
118
119 Iterator input1(in1, input1_win);
120 Iterator input2(in2, input2_win);
121 Iterator output(out, win);
122
123 execute_window_loop(win, [&](const Coordinates & id)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000125 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
126 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
127 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100128
Georgios Pinitas5a594532018-12-03 14:30:05 +0000129 // Compute S elements per iteration
130 int x = window_start_x;
131 for(; x <= (window_end_x - window_step_x); x += window_step_x)
132 {
133 const auto val1 = wrapper::vloadq(input1_ptr + x);
134 const auto val2 = wrapper::vloadq(input2_ptr + x);
135 const auto res = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
136 wrapper::vstore(output_ptr + x, res);
137 }
138
139 // Compute left-over elements
140 for(; x < window_end_x; ++x)
141 {
142 const auto val1 = *(input1_ptr + x);
143 const auto val2 = *(input2_ptr + x);
144 *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2;
145 }
146 },
147 input1, input2, output);
148 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100149}
150
Georgios Pinitas5a594532018-12-03 14:30:05 +0000151void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100152{
Georgios Pinitas5a594532018-12-03 14:30:05 +0000153 ARM_COMPUTE_UNUSED(policy);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100154
Georgios Pinitas5a594532018-12-03 14:30:05 +0000155 // Create input windows
156 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
157 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100158
Georgios Pinitas5a594532018-12-03 14:30:05 +0000159 // Clear X Dimension on execution window as we handle manually
160 Window win = window;
161 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100162
Georgios Pinitas5a594532018-12-03 14:30:05 +0000163 const int window_step_x = 16;
164 const auto window_start_x = static_cast<int>(window.x().start());
165 const auto window_end_x = static_cast<int>(window.x().end());
166 const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100167
Georgios Pinitas5a594532018-12-03 14:30:05 +0000168 const float output_scale = out->info()->quantization_info().scale;
Georgios Pinitas5a594532018-12-03 14:30:05 +0000169 const int output_offset = out->info()->quantization_info().offset;
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000170
171 const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
172 const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000173 const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_scale);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000174 const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
175 const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000176 const float32x4_t voffseto = vdupq_n_f32(output_offset);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000177
Georgios Pinitas5a594532018-12-03 14:30:05 +0000178 if(is_broadcast_across_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000179 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000180 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
181 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
182 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
183 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
184 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
185 const QuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info();
186 const QuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info();
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000187
Georgios Pinitas5a594532018-12-03 14:30:05 +0000188 // Clear X Dimension on execution window as we handle manually
189 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
190
191 Iterator broadcast_input(broadcast_tensor, broadcast_win);
192 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
193 Iterator output(out, win);
194
195 execute_window_loop(win, [&](const Coordinates & id)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000196 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000197 const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
198 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000199
Georgios Pinitas5a594532018-12-03 14:30:05 +0000200 const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
201 const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
202
203 const float32x4x4_t bf =
204 {
205 {
206 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
207 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2),
208 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
209 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2),
210 }
211 };
212 const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
213
214 // Compute S elements per iteration
215 int x = window_start_x;
216 for(; x <= (window_end_x - window_step_x); x += window_step_x)
217 {
218 const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
219 const float32x4x4_t af =
220 {
221 {
222 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
223 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
224 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
225 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
226 }
227 };
228
229 const int32x4x4_t rf =
230 {
231 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000232#ifdef __aarch64__
233 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
234 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
235 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
236 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
237#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000238 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
239 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
240 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
241 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000242#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000243 }
244 };
245
246 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
247 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
248 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
249 }
250
251 // Compute left-over elements
252 for(; x < window_end_x; ++x)
253 {
254 const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000255 *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000256 }
257 },
258 broadcast_input, non_broadcast_input, output);
259 }
260 else
261 {
262 // Clear X Dimension on execution window as we handle manually
263 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
264 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
265
266 const QuantizationInfo input1_qinfo = in1->info()->quantization_info();
267 const QuantizationInfo input2_qinfo = in2->info()->quantization_info();
268
269 Iterator input1(in1, input1_win);
270 Iterator input2(in2, input2_win);
271 Iterator output(out, win);
272
273 execute_window_loop(win, [&](const Coordinates & id)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000274 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000275 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
276 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
277 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000278
Georgios Pinitas5a594532018-12-03 14:30:05 +0000279 // Compute S elements per iteration
280 int x = window_start_x;
281 for(; x <= (window_end_x - window_step_x); x += window_step_x)
282 {
283 const uint8x16_t a = vld1q_u8(input1_ptr + x);
284 const uint8x16_t b = vld1q_u8(input2_ptr + x);
285
286 const float32x4x4_t af =
287 {
288 {
289 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
290 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
291 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
292 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
293 }
294 };
295
296 const float32x4x4_t bf =
297 {
298 {
299 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
300 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
301 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
302 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
303 }
304 };
305
306 const int32x4x4_t rf =
307 {
308 {
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000309#ifdef __aarch64__
310 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
311 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
312 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
313 vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
314#else //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000315 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
316 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
317 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
318 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000319#endif //__aarch64__
Georgios Pinitas5a594532018-12-03 14:30:05 +0000320 }
321 };
322
323 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
324 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
325 vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
326 }
327
328 // Compute left-over elements
329 for(; x < window_end_x; ++x)
330 {
331 const float afs = static_cast<int32_t>((*(input1_ptr + x)) - input1_qinfo.offset) * input1_qinfo.scale;
332 const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - input2_qinfo.offset) * input2_qinfo.scale;
Vidhya Sudhan Loganathanf8b65202019-02-01 09:49:50 +0000333 *(output_ptr + x) = out->info()->quantization_info().quantize((afs + bfs),RoundingPolicy::TO_NEAREST_UP);
Georgios Pinitas5a594532018-12-03 14:30:05 +0000334 }
335 },
336 input1, input2, output);
337 }
338}
339
340void add_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
341{
342 // Create input windows
343 Window win = window;
344 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
345 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
346
347 // Clear X Dimension on execution window as we handle manually
348 win.set(Window::DimX, Window::Dimension(0, 1, 1));
349 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
350 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
351
352 Iterator input1(in1, input1_win);
353 Iterator input2(in2, input2_win);
354 Iterator output(out, win);
355
356 const int window_step_x = 8;
357 const auto window_start_x = static_cast<int>(window.x().start());
358 const auto window_end_x = static_cast<int>(window.x().end());
359
360 execute_window_loop(win, [&](const Coordinates & id)
361 {
362 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
363 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
364 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
365
366 if(policy == ConvertPolicy::WRAP)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000367 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000368 // Compute S elements per iteration
369 int x = window_start_x;
370 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000371 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000372 const auto vin1 = wrapper::vloadq(input1_ptr + x);
373 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
374 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000375 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000376
Georgios Pinitas5a594532018-12-03 14:30:05 +0000377 // Compute left-over elements
378 for(; x < window_end_x; ++x)
379 {
380 *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
381 }
382 }
383 else
384 {
385 // Compute S elements per iteration
386 int x = window_start_x;
387 for(; x <= (window_end_x - window_step_x); x += window_step_x)
388 {
389 const auto vin1 = wrapper::vloadq(input1_ptr + x);
390 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
391 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
392 }
393
394 // Compute left-over elements
395 for(; x < window_end_x; ++x)
396 {
397 *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
398 }
399 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000400 },
401 input1, input2, output);
402}
403
Georgios Pinitas5a594532018-12-03 14:30:05 +0000404inline void add_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100405{
Georgios Pinitas5a594532018-12-03 14:30:05 +0000406 // Simply swap the two input buffers:
407 add_S16_U8_S16(input2, input1, output, policy, window);
408}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100409
Georgios Pinitas5a594532018-12-03 14:30:05 +0000410void add_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
411{
412 // Create input windows
413 Window win = window;
414 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
415 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
416
417 // Clear X Dimension on execution window as we handle manually
418 win.set(Window::DimX, Window::Dimension(0, 1, 1));
419 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
420 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
421
422 Iterator input1(in1, input1_win);
423 Iterator input2(in2, input2_win);
424 Iterator output(out, win);
425
426 const int window_step_x = 8;
427 const auto window_start_x = static_cast<int>(window.x().start());
428 const auto window_end_x = static_cast<int>(window.x().end());
429
430 execute_window_loop(win, [&](const Coordinates & id)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100431 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000432 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
433 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
434 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100435
Georgios Pinitas5a594532018-12-03 14:30:05 +0000436 if(policy == ConvertPolicy::WRAP)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100437 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000438 // Compute S elements per iteration
439 int x = window_start_x;
440 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100441 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000442 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
443 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
444 wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100445 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100446
Georgios Pinitas5a594532018-12-03 14:30:05 +0000447 // Compute left-over elements
448 for(; x < window_end_x; ++x)
449 {
450 *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
451 }
452 }
453 else
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100454 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000455 // Compute S elements per iteration
456 int x = window_start_x;
457 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000459 const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
460 const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
461 wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100462 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100463
Georgios Pinitas5a594532018-12-03 14:30:05 +0000464 // Compute left-over elements
465 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100466 {
Georgios Pinitas5a594532018-12-03 14:30:05 +0000467 *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
468 static_cast<int16_t>(*(input2_ptr + x)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100469 }
Georgios Pinitas5a594532018-12-03 14:30:05 +0000470 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100471 },
472 input1, input2, output);
473}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000474
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000475Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000476{
477 ARM_COMPUTE_UNUSED(policy);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000478
Anthony Barbiereaefd002018-07-20 17:49:35 +0100479 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000480 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
481 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000482
483 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
484
485 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
Georgios Pinitas5a594532018-12-03 14:30:05 +0000486 ARM_COMPUTE_RETURN_ERROR_ON_MSG((input1.tensor_shape().x() != input2.tensor_shape().x()) && ((input1.data_type() != input2.data_type()) || (input1.data_type() != output.data_type())
487 || (input2.data_type() != output.data_type())),
488 "Broadcasting across width is supported on configurations where all tensors have the same data type");
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000489
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000490 // Validate in case of configured output
491 if(output.total_size() > 0)
492 {
493 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100494 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000495 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
496 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
497 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000498 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
499 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000500 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
501 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8),
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000502 "You called addition with the wrong image formats");
503
504 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
505 "Wrong shape for output");
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000506 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000507
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000508 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000509}
510
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000511std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000512{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000513 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
514 const TensorShape &out_shape = broadcast_pair.first;
515 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000516
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000517 // Auto initialize output if not initialized
518 {
519 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000520
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000521 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
522 {
523 set_format_if_unknown(output, Format::S16);
524 }
525 else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
526 {
527 set_format_if_unknown(output, Format::F16);
528 }
529 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
530 {
531 set_format_if_unknown(output, Format::F32);
532 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000533 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
534 {
535 set_data_type_if_unknown(output, DataType::QASYMM8);
536 }
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000537 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000538
Georgios Pinitas5a594532018-12-03 14:30:05 +0000539 Window win = calculate_max_window(valid_region, Steps());
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000540
Georgios Pinitas5a594532018-12-03 14:30:05 +0000541 // NEArithmeticAdditionKernel doesn't need padding so update_window_and_padding() can be skipped
542 Coordinates coord;
543 coord.set_num_dimensions(output.num_dimensions());
544 output.set_valid_region(valid_region);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000545
Georgios Pinitas5a594532018-12-03 14:30:05 +0000546 return std::make_pair(Status{}, win);
547 ;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000548}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100549} // namespace
550
551NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
Georgios Pinitas5a594532018-12-03 14:30:05 +0000552 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100553{
554}
555
556void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
557{
558 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000559 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100560
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000561 // Configure kernel window
562 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
563 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100564
565 static std::map<std::string, AddFunction *> map_function =
566 {
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000567 { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
568 { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Georgios Pinitas5a594532018-12-03 14:30:05 +0000569 { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
570 { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
571 { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
572 { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
573 { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
574 { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
575 { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
576 { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
577 { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
578 { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
579 { "add_wrap_F32_F32_F32", &add_same<float, false> },
580 { "add_saturate_F32_F32_F32", &add_same<float, false> },
581#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
582 { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
583 { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
584#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100585 };
586
587 _input1 = input1;
588 _input2 = input2;
589 _output = output;
Georgios Pinitas5a594532018-12-03 14:30:05 +0000590 _policy = policy;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100591
592 std::string function_to_call("add_");
593 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
594 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
595 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
596 function_to_call += string_from_data_type(output->info()->data_type());
597
598 auto it = map_function.find(function_to_call);
599
600 if(it != map_function.end())
601 {
602 _func = it->second;
603 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100604
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000605 INEKernel::configure(win_config.second);
606}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100607
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000608Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000609{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100610 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000611
612 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
613 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100614
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000615 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100616}
617
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100618void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100619{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100620 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100621 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
622 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
623 ARM_COMPUTE_ERROR_ON(_func == nullptr);
624
Georgios Pinitas5a594532018-12-03 14:30:05 +0000625 (*_func)(_input1, _input2, _output, _policy, window);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000626}