blob: a487090a98ca01864e4aaa1f228a184f06b2ab05 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Diego Lopez Recas0021d752017-12-18 14:42:56 +00002 * Copyright (c) 2016-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
25
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/IAccessWindow.h"
29#include "arm_compute/core/ITensor.h"
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010030#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031#include "arm_compute/core/TensorInfo.h"
32#include "arm_compute/core/Validate.h"
33
34#include <algorithm>
35#include <arm_neon.h>
36#include <cstdint>
37#include <map>
38#include <string>
39
40using namespace arm_compute;
41
42namespace arm_compute
43{
44class Coordinates;
45} // namespace arm_compute
46
47namespace
48{
// Number of elements handled per step along dimension 0. It is used as the window
// step and as the horizontal access width when the kernel window is configured,
// and it bounds the border reported by border_size().
constexpr unsigned int num_elems_processed_per_iteration{ 16U };
50
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010051void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
52{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000053 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
54 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010055 Iterator output(out, window);
56
57 execute_window_loop(window, [&](const Coordinates & id)
58 {
59 const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
60 const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
61
62 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
63 },
64 input1, input2, output);
65}
66
67void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
68{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000069 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
70 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010071 Iterator output(out, window);
72
73 execute_window_loop(window, [&](const Coordinates & id)
74 {
75 const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
76 const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
77
78 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
79 },
80 input1, input2, output);
81}
82
Anthony Barbier6ff3b192017-09-04 18:44:23 +010083void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
84{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000085 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
86 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010087 Iterator output(out, window);
88
89 execute_window_loop(window, [&](const Coordinates & id)
90 {
91 vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
92 },
93 input1, input2, output);
94}
95
96void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
97{
Diego Lopez Recas0021d752017-12-18 14:42:56 +000098 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
99 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100100 Iterator output(out, window);
101
102 execute_window_loop(window, [&](const Coordinates & id)
103 {
104 vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
105 },
106 input1, input2, output);
107}
108
109inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
110{
111 const int16x8x2_t res =
112 {
113 {
114 vaddq_s16(a.val[0], b.val[0]),
115 vaddq_s16(a.val[1], b.val[1])
116 }
117 };
118
119 return res;
120}
121
122inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
123{
124 const float32x4x4_t res =
125 {
126 {
127 vaddq_f32(a.val[0], b.val[0]),
128 vaddq_f32(a.val[1], b.val[1]),
129 vaddq_f32(a.val[2], b.val[2]),
130 vaddq_f32(a.val[3], b.val[3])
131 }
132 };
133
134 return res;
135}
136
137inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
138{
139 const int16x8x2_t res =
140 {
141 {
142 vqaddq_s16(a.val[0], b.val[0]),
143 vqaddq_s16(a.val[1], b.val[1])
144 }
145 };
146
147 return res;
148}
149
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Add two pairs of float16x8 vectors lane by lane with vaddq_f16.
 *
 * Only compiled when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.
 *
 * @param[in] a First operand (two vectors).
 * @param[in] b Second operand (two vectors).
 *
 * @return Pair whose i-th vector is vaddq_f16(a.val[i], b.val[i]).
 */
inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t sum;

    sum.val[0] = vaddq_f16(a.val[0], b.val[0]);
    sum.val[1] = vaddq_f16(a.val[1], b.val[1]);

    return sum;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100164
165void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
166{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000167#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000168 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
169 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100170 Iterator output(out, window);
171
172 execute_window_loop(window, [&](const Coordinates & id)
173 {
174 const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
175 const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
176
177 vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
178 },
179 input1, input2, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000180#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100181 ARM_COMPUTE_UNUSED(in1);
182 ARM_COMPUTE_UNUSED(in2);
183 ARM_COMPUTE_UNUSED(out);
184 ARM_COMPUTE_UNUSED(window);
185 ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000186#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100187}
188
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100189void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
190{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000191 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
192 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100193 Iterator output(out, window);
194
195 execute_window_loop(window, [&](const Coordinates & id)
196 {
197 const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
198 const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
199
200 vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
201 },
202 input1, input2, output);
203}
204
205void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
206{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000207 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
208 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100209 Iterator output(out, window);
210
211 execute_window_loop(window, [&](const Coordinates & id)
212 {
213 const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
214 const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
215
216 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
217 },
218 input1, input2, output);
219}
220
221void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
222{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000223 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
224 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100225 Iterator output(out, window);
226
227 execute_window_loop(window, [&](const Coordinates & id)
228 {
229 const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
230 const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
231
232 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
233 },
234 input1, input2, output);
235}
236
237void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
238{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000239 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
240 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100241 Iterator output(out, window);
242
243 execute_window_loop(window, [&](const Coordinates & id)
244 {
245 const int16x8x2_t a =
246 {
247 {
248 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
249 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
250 }
251 };
252 const uint8x16_t b = vld1q_u8(input2.ptr());
253
254 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
255 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
256 },
257 input1, input2, output);
258}
259
260void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
261{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000262 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
263 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100264 Iterator output(out, window);
265
266 execute_window_loop(window, [&](const Coordinates & id)
267 {
268 const int16x8x2_t a =
269 {
270 {
271 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
272 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
273 }
274 };
275 const uint8x16_t b = vld1q_u8(input2.ptr());
276
277 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
278 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
279 },
280 input1, input2, output);
281}
282
283inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
284{
285 //Simply swap the two input buffers:
286 add_wrap_S16_U8_S16(input2, input1, output, window);
287}
288
289inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
290{
291 //Simply swap the two input buffers:
292 add_saturate_S16_U8_S16(input2, input1, output, window);
293}
294
295void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
296{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000297 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
298 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100299 Iterator output(out, window);
300
301 execute_window_loop(window, [&](const Coordinates & id)
302 {
303 const uint8x16_t a = vld1q_u8(input1.ptr());
304 const uint8x16_t b = vld1q_u8(input2.ptr());
305
306 const int16x8x2_t a_s16 =
307 {
308 {
309 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
310 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
311 }
312 };
313
314 const int16x8x2_t b_s16 =
315 {
316 {
317 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
318 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
319 }
320 };
321
322 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
323 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
324 },
325 input1, input2, output);
326}
327
328void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
329{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000330 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
331 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100332 Iterator output(out, window);
333
334 execute_window_loop(window, [&](const Coordinates & id)
335 {
336 const uint8x16_t a = vld1q_u8(input1.ptr());
337 const uint8x16_t b = vld1q_u8(input2.ptr());
338
339 const int16x8x2_t a_s16 =
340 {
341 {
342 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
343 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
344 }
345 };
346
347 const int16x8x2_t b_s16 =
348 {
349 {
350 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
351 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
352 }
353 };
354
355 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
356 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
357 },
358 input1, input2, output);
359}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000360
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000361Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000362{
363 ARM_COMPUTE_UNUSED(policy);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000364
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000365 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
366 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
367
368 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
369
370 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
371
372 if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(input2.data_type()))
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000373 {
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000374 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000375 }
376
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000377 // Validate in case of configured output
378 if(output.total_size() > 0)
379 {
380 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
381 !(input1.data_type() == DataType::QS8 && input2.data_type() == DataType::QS8 && output.data_type() == DataType::QS8)
382 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
383 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
384 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
385 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
386 && !(input1.data_type() == DataType::QS16 && input2.data_type() == DataType::QS16 && output.data_type() == DataType::QS16)
387 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
388 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
389 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
390 "You called addition with the wrong image formats");
391
392 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
393 "Wrong shape for output");
394
395 if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(output.data_type()))
396 {
397 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
398 }
399 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000400
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000401 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000402}
403
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000404std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000405{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000406 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
407 const TensorShape &out_shape = broadcast_pair.first;
408 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000409
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000410 // Auto initialize output if not initialized
411 {
412 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000413
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000414 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
415 {
416 set_format_if_unknown(output, Format::S16);
417 }
418 else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
419 {
420 set_format_if_unknown(output, Format::F16);
421 }
422 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
423 {
424 set_format_if_unknown(output, Format::F32);
425 }
426 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000427
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000428 Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
429 Window win_input1 = win.broadcast_if_dimension_le_one(input1);
430 Window win_input2 = win.broadcast_if_dimension_le_one(input2);
431
432 AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
433 AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
434 AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
435
436 bool window_changed = update_window_and_padding(win_input1, input1_access)
437 || update_window_and_padding(win_input2, input2_access)
438 || update_window_and_padding(win, output_access);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000439
440 output_access.set_valid_region(win, valid_region);
441
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000442 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000443 return std::make_pair(err, win);
444}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100445} // namespace
446
447NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
448 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
449{
450}
451
452void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
453{
454 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000455 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100456
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000457 // Configure kernel window
458 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
459 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460
461 static std::map<std::string, AddFunction *> map_function =
462 {
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100463 { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
464 { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100465 { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
466 { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
467 { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
468 { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
469 { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
470 { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
471 { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
472 { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100473 { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
474 { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100475 { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
476 { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
477 { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
478 { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
Pablo Tellod1b0ecc2017-07-11 11:27:04 +0100479 { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
480 { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100481 };
482
483 _input1 = input1;
484 _input2 = input2;
485 _output = output;
486
487 std::string function_to_call("add_");
488 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
489 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
490 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
491 function_to_call += string_from_data_type(output->info()->data_type());
492
493 auto it = map_function.find(function_to_call);
494
495 if(it != map_function.end())
496 {
497 _func = it->second;
498 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100499
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000500 INEKernel::configure(win_config.second);
501}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100502
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000503Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000504{
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000505 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
506
507 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
508 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100509
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000510 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100511}
512
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100513void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100515 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100516 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
517 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
518 ARM_COMPUTE_ERROR_ON(_func == nullptr);
519
520 (*_func)(_input1, _input2, _output, window);
521}
Diego Lopez Recas0021d752017-12-18 14:42:56 +0000522
523BorderSize NEArithmeticAdditionKernel::border_size() const
524{
525 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
526 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
527 return BorderSize(0, border, 0, 0);
528}