blob: 954a2c1754c7e8c30208fdfdc794f9ad6a0f2655 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Diego Lopez Recas0021d752017-12-18 14:42:56 +00002 * Copyright (c) 2016-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/IAccessWindow.h"
30#include "arm_compute/core/ITensor.h"
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010031#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cstdint>
38#include <map>
39#include <string>
40
41using namespace arm_compute;
42
43namespace arm_compute
44{
45class Coordinates;
46} // namespace arm_compute
47
48namespace
49{
/** Number of elements handled per window-loop iteration by the kernels in this file.
 *
 * Used as the window step, as the width of the horizontal access windows and as the
 * upper bound (minus one) of the right border reported by border_size().
 */
constexpr unsigned int num_elems_processed_per_iteration{ 16U };
51
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
53{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000054 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
55 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010056 Iterator output(out, window);
57
58 execute_window_loop(window, [&](const Coordinates & id)
59 {
60 vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
61 },
62 input1, input2, output);
63}
64
65void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
66{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000067 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
68 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010069 Iterator output(out, window);
70
71 execute_window_loop(window, [&](const Coordinates & id)
72 {
73 vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
74 },
75 input1, input2, output);
76}
77
78inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
79{
80 const int16x8x2_t res =
81 {
82 {
83 vaddq_s16(a.val[0], b.val[0]),
84 vaddq_s16(a.val[1], b.val[1])
85 }
86 };
87
88 return res;
89}
90
91inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
92{
93 const float32x4x4_t res =
94 {
95 {
96 vaddq_f32(a.val[0], b.val[0]),
97 vaddq_f32(a.val[1], b.val[1]),
98 vaddq_f32(a.val[2], b.val[2]),
99 vaddq_f32(a.val[3], b.val[3])
100 }
101 };
102
103 return res;
104}
105
106inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
107{
108 const int16x8x2_t res =
109 {
110 {
111 vqaddq_s16(a.val[0], b.val[0]),
112 vqaddq_s16(a.val[1], b.val[1])
113 }
114 };
115
116 return res;
117}
118
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Lane-wise addition of two pairs of float16x8 vectors.
 *
 * Only compiled when FP16 vector arithmetic is available.
 *
 * @param[in] a First operand (two float16x8 vectors).
 * @param[in] b Second operand (two float16x8 vectors).
 *
 * @return The pair { a.val[0] + b.val[0], a.val[1] + b.val[1] }.
 */
inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t sum;
    sum.val[0] = vaddq_f16(a.val[0], b.val[0]);
    sum.val[1] = vaddq_f16(a.val[1], b.val[1]);
    return sum;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100133
134void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
135{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000136#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000137 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
138 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100139 Iterator output(out, window);
140
141 execute_window_loop(window, [&](const Coordinates & id)
142 {
143 const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
144 const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
145
146 vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
147 },
148 input1, input2, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000149#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100150 ARM_COMPUTE_UNUSED(in1);
151 ARM_COMPUTE_UNUSED(in2);
152 ARM_COMPUTE_UNUSED(out);
153 ARM_COMPUTE_UNUSED(window);
154 ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000155#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100156}
157
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100158void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
159{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000160 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
161 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100162 Iterator output(out, window);
163
164 execute_window_loop(window, [&](const Coordinates & id)
165 {
166 const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
167 const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
168
169 vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
170 },
171 input1, input2, output);
172}
173
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000174void add_QASYMM8_QASYMM8_QASYMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
175{
176 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
177 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
178 Iterator output(out, window);
179
180 const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
181 const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
182 const float32x4_t invvscaleo = vdupq_n_f32(1.f / out->info()->quantization_info().scale);
183 const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
184 const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
185 const float32x4_t voffseto = vdupq_n_f32(out->info()->quantization_info().offset);
186
187 execute_window_loop(window, [&](const Coordinates & id)
188 {
189 const uint8x16_t a = vld1q_u8(input1.ptr());
190 const uint8x16_t b = vld1q_u8(input2.ptr());
191
192 const float32x4x4_t af =
193 {
194 {
195 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
196 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
197 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
198 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
199 }
200 };
201
202 const float32x4x4_t bf =
203 {
204 {
205 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
206 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
207 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
208 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
209 }
210 };
211
212 const int32x4x4_t rf =
213 {
214 {
215 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
216 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
217 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
218 vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
219 }
220 };
221
222 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
223 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
224 vst1q_u8(output.ptr(), vcombine_u8(pa, pb));
225 },
226 input1, input2, output);
227}
228
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100229void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
230{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000231 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
232 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100233 Iterator output(out, window);
234
235 execute_window_loop(window, [&](const Coordinates & id)
236 {
237 const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
238 const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
239
240 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
241 },
242 input1, input2, output);
243}
244
245void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
246{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000247 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
248 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100249 Iterator output(out, window);
250
251 execute_window_loop(window, [&](const Coordinates & id)
252 {
253 const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
254 const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
255
256 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
257 },
258 input1, input2, output);
259}
260
261void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
262{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000263 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
264 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100265 Iterator output(out, window);
266
267 execute_window_loop(window, [&](const Coordinates & id)
268 {
269 const int16x8x2_t a =
270 {
271 {
272 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
273 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
274 }
275 };
276 const uint8x16_t b = vld1q_u8(input2.ptr());
277
278 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
279 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
280 },
281 input1, input2, output);
282}
283
284void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
285{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000286 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
287 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100288 Iterator output(out, window);
289
290 execute_window_loop(window, [&](const Coordinates & id)
291 {
292 const int16x8x2_t a =
293 {
294 {
295 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
296 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
297 }
298 };
299 const uint8x16_t b = vld1q_u8(input2.ptr());
300
301 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
302 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
303 },
304 input1, input2, output);
305}
306
307inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
308{
309 //Simply swap the two input buffers:
310 add_wrap_S16_U8_S16(input2, input1, output, window);
311}
312
313inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
314{
315 //Simply swap the two input buffers:
316 add_saturate_S16_U8_S16(input2, input1, output, window);
317}
318
319void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
320{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000321 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
322 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100323 Iterator output(out, window);
324
325 execute_window_loop(window, [&](const Coordinates & id)
326 {
327 const uint8x16_t a = vld1q_u8(input1.ptr());
328 const uint8x16_t b = vld1q_u8(input2.ptr());
329
330 const int16x8x2_t a_s16 =
331 {
332 {
333 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
334 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
335 }
336 };
337
338 const int16x8x2_t b_s16 =
339 {
340 {
341 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
342 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
343 }
344 };
345
346 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
347 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
348 },
349 input1, input2, output);
350}
351
352void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
353{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000354 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
355 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100356 Iterator output(out, window);
357
358 execute_window_loop(window, [&](const Coordinates & id)
359 {
360 const uint8x16_t a = vld1q_u8(input1.ptr());
361 const uint8x16_t b = vld1q_u8(input2.ptr());
362
363 const int16x8x2_t a_s16 =
364 {
365 {
366 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
367 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
368 }
369 };
370
371 const int16x8x2_t b_s16 =
372 {
373 {
374 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
375 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
376 }
377 };
378
379 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
380 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
381 },
382 input1, input2, output);
383}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000384
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000385Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000386{
387 ARM_COMPUTE_UNUSED(policy);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000388
Anthony Barbiereaefd002018-07-20 17:49:35 +0100389 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000390 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
391 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000392
393 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
394
395 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
396
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000397 // Validate in case of configured output
398 if(output.total_size() > 0)
399 {
400 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100401 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000402 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
403 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
404 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000405 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
406 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000407 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
408 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8),
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000409 "You called addition with the wrong image formats");
410
411 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
412 "Wrong shape for output");
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000413 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000414
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000415 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000416}
417
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000418std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000419{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000420 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
421 const TensorShape &out_shape = broadcast_pair.first;
422 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000423
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000424 // Auto initialize output if not initialized
425 {
426 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000427
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000428 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
429 {
430 set_format_if_unknown(output, Format::S16);
431 }
432 else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
433 {
434 set_format_if_unknown(output, Format::F16);
435 }
436 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
437 {
438 set_format_if_unknown(output, Format::F32);
439 }
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000440 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
441 {
442 set_data_type_if_unknown(output, DataType::QASYMM8);
443 }
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000444 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000445
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000446 Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
447 Window win_input1 = win.broadcast_if_dimension_le_one(input1);
448 Window win_input2 = win.broadcast_if_dimension_le_one(input2);
449
450 AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
451 AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
452 AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
453
454 bool window_changed = update_window_and_padding(win_input1, input1_access)
455 || update_window_and_padding(win_input2, input2_access)
456 || update_window_and_padding(win, output_access);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000457
458 output_access.set_valid_region(win, valid_region);
459
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000460 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000461 return std::make_pair(err, win);
462}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100463} // namespace
464
465NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
466 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
467{
468}
469
470void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
471{
472 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000473 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100474
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000475 // Configure kernel window
476 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
477 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478
479 static std::map<std::string, AddFunction *> map_function =
480 {
481 { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
482 { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
483 { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
484 { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
485 { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
486 { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
487 { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
488 { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
489 { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
490 { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
491 { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
492 { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100493 { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
494 { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
Georgios Pinitasa84faff2018-12-05 18:17:24 +0000495 { "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
496 { "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100497 };
498
499 _input1 = input1;
500 _input2 = input2;
501 _output = output;
502
503 std::string function_to_call("add_");
504 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
505 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
506 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
507 function_to_call += string_from_data_type(output->info()->data_type());
508
509 auto it = map_function.find(function_to_call);
510
511 if(it != map_function.end())
512 {
513 _func = it->second;
514 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100515
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000516 INEKernel::configure(win_config.second);
517}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100518
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000519Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000520{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100521 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000522
523 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
524 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100525
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000526 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100527}
528
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100529void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100530{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100531 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100532 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
533 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
534 ARM_COMPUTE_ERROR_ON(_func == nullptr);
535
536 (*_func)(_input1, _input2, _output, window);
537}
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000538
539BorderSize NEArithmeticAdditionKernel::border_size() const
540{
541 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
542 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
543 return BorderSize(0, border, 0, 0);
544}