blob: 7a2601be267ddaa9641111619bde6ea98c2935f7 [file] [log] [blame]
/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000030#include "arm_compute/core/NEON/NEAsymm.h"
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010031#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cstdint>
38#include <map>
39#include <string>
40
41using namespace arm_compute;
42
43namespace arm_compute
44{
45class Coordinates;
46} // namespace arm_compute
47
48namespace
49{
/** Number of tensor elements handled per kernel-loop iteration.
 *
 * Used in this file as the window step along the first dimension, as the
 * width of the horizontal access windows, and as the upper bound (minus one)
 * of the border reported by border_size().
 */
constexpr unsigned int num_elems_processed_per_iteration = 16;
51
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
53{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010054 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
55 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010056 Iterator output(out, window);
57
Michalis Spyroua4f378d2019-04-26 14:54:54 +010058 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010059 {
60 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
61 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
62
63 vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
64 },
65 input1, input2, output);
66}
67
68void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
69{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010070 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
71 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072 Iterator output(out, window);
73
Michalis Spyroua4f378d2019-04-26 14:54:54 +010074 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 {
76 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
77 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
78
79 vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
80 },
81 input1, input2, output);
82}
83
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000084void sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
85{
86 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
87 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
88 Iterator output(out, window);
89
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010090 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
91 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
92 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
93
Michalis Spyroua4f378d2019-04-26 14:54:54 +010094 execute_window_loop(window, [&](const Coordinates &)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000095 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010096 const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), iq1_info);
97 const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), iq2_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000098
99 const float32x4x4_t ta3 =
100 {
101 {
102 vsubq_f32(ta1.val[0], ta2.val[0]),
103 vsubq_f32(ta1.val[1], ta2.val[1]),
104 vsubq_f32(ta1.val[2], ta2.val[2]),
105 vsubq_f32(ta1.val[3], ta2.val[3]),
106 }
107 };
108
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100109 const uint8x16_t result = vquantize(ta3, oq_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000110
111 vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
112 },
113 input1, input2, output);
114}
115
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000116void sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
117{
118 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
119 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
120 Iterator output(out, window);
121
122 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
123 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
124 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
125
126 execute_window_loop(window, [&](const Coordinates &)
127 {
128 const float32x4x4_t ta1 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input1.ptr())), iq1_info);
129 const float32x4x4_t ta2 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input2.ptr())), iq2_info);
130
131 const float32x4x4_t ta3 =
132 {
133 {
134 vsubq_f32(ta1.val[0], ta2.val[0]),
135 vsubq_f32(ta1.val[1], ta2.val[1]),
136 vsubq_f32(ta1.val[2], ta2.val[2]),
137 vsubq_f32(ta1.val[3], ta2.val[3]),
138 }
139 };
140
141 const int8x16_t result = vquantize_signed(ta3, oq_info);
142
143 vst1q_s8(reinterpret_cast<qasymm8_signed_t *>(output.ptr()), result);
144 },
145 input1, input2, output);
146}
147
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100148void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
149{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100150 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
151 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100152 Iterator output(out, window);
153
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100154 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155 {
156 const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
157 const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
158
159 const int16x8x2_t ta3 =
160 {
161 {
162 vsubq_s16(ta1.val[0], ta2.val[0]),
163 vsubq_s16(ta1.val[1], ta2.val[1])
164 }
165 };
166
167 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
168 },
169 input1, input2, output);
170}
171
172void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
173{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100174 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
175 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100176 Iterator output(out, window);
177
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100178 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100179 {
180 const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
181 const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
182
183 const int16x8x2_t ta3 =
184 {
185 {
186 vqsubq_s16(ta1.val[0], ta2.val[0]),
187 vqsubq_s16(ta1.val[1], ta2.val[1])
188 }
189 };
190
191 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
192 },
193 input1, input2, output);
194}
195
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Subtract two pairs of 8-lane half-precision registers, register by register.
 *
 * @param[in] a Minuend register pair.
 * @param[in] b Subtrahend register pair.
 *
 * @return A pair whose i-th register is a.val[i] - b.val[i].
 */
inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t diff;
    diff.val[0] = vsubq_f16(a.val[0], b.val[0]);
    diff.val[1] = vsubq_f16(a.val[1], b.val[1]);
    return diff;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100210
211void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
212{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000213#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100214 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
215 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tellod7a5d222017-07-11 13:54:43 +0100216 Iterator output(out, window);
217
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100218 execute_window_loop(window, [&](const Coordinates &)
Pablo Tellod7a5d222017-07-11 13:54:43 +0100219 {
220 const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
221 const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
222
223 vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b));
224 },
225 input1, input2, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000226#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100227 ARM_COMPUTE_UNUSED(in1);
228 ARM_COMPUTE_UNUSED(in2);
229 ARM_COMPUTE_UNUSED(out);
230 ARM_COMPUTE_UNUSED(window);
231 ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000232#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100233}
234
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100235void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
236{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100237 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
238 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239 Iterator output(out, window);
240
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100241 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100242 {
243 const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
244 const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
245
246 const float32x4x4_t ta3 =
247 {
248 {
249 vsubq_f32(ta1.val[0], ta2.val[0]),
250 vsubq_f32(ta1.val[1], ta2.val[1]),
251 vsubq_f32(ta1.val[2], ta2.val[2]),
252 vsubq_f32(ta1.val[3], ta2.val[3]),
253 }
254 };
255
256 vst4q_f32(reinterpret_cast<float *>(output.ptr()), ta3);
257 },
258 input1, input2, output);
259}
260void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
261{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100262 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
263 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100264 Iterator output(out, window);
265
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100266 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100267 {
268 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
269 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
270 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
271
272 a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
273 a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
274
275 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
276 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
277 },
278 input1, input2, output);
279}
280
281void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
282{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100283 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
284 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285 Iterator output(out, window);
286
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100287 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100288 {
289 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
290 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
291 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
292
293 a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
294 a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
295
296 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
297 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
298 },
299 input1, input2, output);
300}
301
302void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
303{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100304 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
305 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100306 Iterator output(out, window);
307
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100308 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100309 {
310 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
311 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
312 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
313
314 a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
315 a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
316
317 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
318 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
319 },
320 input1, input2, output);
321}
322
323void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
324{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100325 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
326 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100327 Iterator output(out, window);
328
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100329 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100330 {
331 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
332 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
333 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
334
335 a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
336 a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
337
338 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
339 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
340 },
341 input1, input2, output);
342}
343
344void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
345{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100346 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
347 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100348 Iterator output(out, window);
349
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100350 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100351 {
352 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
353 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
354
355 const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
356 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
357 const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
358 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
359
360 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
361 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
362 },
363 input1, input2, output);
364}
365
366void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
367{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100368 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
369 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100370 Iterator output(out, window);
371
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100372 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100373 {
374 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
375 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
376
377 const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
378 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
379 const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
380 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
381
382 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
383 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
384 },
385 input1, input2, output);
386}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000387
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100388inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000389{
390 ARM_COMPUTE_UNUSED(policy);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100391 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000392 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
393 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
394 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000395
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100396 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
397 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000398
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000399 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
400 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
401 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000402 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000403 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
404 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
405 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
406 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
407 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
408 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
409 "You called subtract with the wrong image formats");
410
411 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000412 input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP
413 && input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP,
414 "Convert policy cannot be WRAP if datatype is QASYMM8 or QASYMM8_SIGNED");
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000415
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100416 // Validate in case of configured output
417 if(output.total_size() > 0)
418 {
419 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
420 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000421 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000422 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100423 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
424 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
425 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
426 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
427 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
428 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
429 "You called subtract with the wrong image formats");
430
431 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
432 "Wrong shape for output");
433 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000434 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000435}
436
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100437inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000438{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100439 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
440 const TensorShape &out_shape = broadcast_pair.first;
441 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000442
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100443 // Auto initialize output if not initialized
444 {
445 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000446
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100447 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
448 {
449 set_format_if_unknown(output, Format::S16);
450 }
451 else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
452 {
453 set_format_if_unknown(output, Format::F16);
454 }
455 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
456 {
457 set_format_if_unknown(output, Format::F32);
458 }
459 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000460
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100461 Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
462 Window win_input1 = win.broadcast_if_dimension_le_one(input1);
463 Window win_input2 = win.broadcast_if_dimension_le_one(input2);
464
465 AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
466 AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
467 AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
468
469 bool window_changed = update_window_and_padding(win_input1, input1_access)
470 || update_window_and_padding(win_input2, input2_access)
471 || update_window_and_padding(win, output_access);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000472
473 output_access.set_valid_region(win, valid_region);
474
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000475 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000476 return std::make_pair(err, win);
477}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478} // namespace
479
480NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
481 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
482{
483}
484
485void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
486{
487 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100488 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100489
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100490 // Configure kernel window
491 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
492 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100493
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000494 static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100495 {
496 { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
497 { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
498 { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
499 { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000500 { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000501 { "sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100502 { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
503 { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
504 { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
505 { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100506 { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
507 { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
508 { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
509 { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
Pablo Tellod7a5d222017-07-11 13:54:43 +0100510 { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
511 { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100512 };
513
514 _input1 = input1;
515 _input2 = input2;
516 _output = output;
517
518 std::string function_to_call("sub_");
519 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
520 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
521 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
522 function_to_call += string_from_data_type(output->info()->data_type());
523
524 auto it = map_function.find(function_to_call);
525
526 if(it != map_function.end())
527 {
528 _func = it->second;
529 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100530
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000531 INEKernel::configure(win_config.second);
532}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100533
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000534Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000535{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100536 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
537
538 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
539 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100540
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000541 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100542}
543
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100544void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100545{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100546 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100547 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
548 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
549 ARM_COMPUTE_ERROR_ON(_func == nullptr);
550
551 (*_func)(_input1, _input2, _output, window);
552}
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100553
554BorderSize NEArithmeticSubtractionKernel::border_size() const
555{
556 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
557 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100558 return BorderSize{ 0, border, 0, 0 };
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100559}