/*
 * Copyright (c) 2016-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000030#include "arm_compute/core/NEON/NEAsymm.h"
Michele Di Giorgio81f0d152017-07-11 15:00:52 +010031#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Validate.h"
34
35#include <algorithm>
36#include <arm_neon.h>
37#include <cstdint>
38#include <map>
39#include <string>
40
// Bring the library namespace into scope for the rest of this translation unit.
using namespace arm_compute;

namespace arm_compute
{
// Forward declaration: in this file Coordinates only appears as the (unnamed)
// const-reference parameter of the window-loop lambdas.
class Coordinates;
} // namespace arm_compute
47
48namespace
49{
/** Number of elements handled per kernel iteration.
 *
 * Used in this file as the window step along X, as the width of the horizontal
 * access windows, and when computing the border size.
 */
constexpr unsigned int num_elems_processed_per_iteration = 16;
51
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
53{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010054 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
55 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010056 Iterator output(out, window);
57
Michalis Spyroua4f378d2019-04-26 14:54:54 +010058 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010059 {
60 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
61 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
62
63 vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
64 },
65 input1, input2, output);
66}
67
68void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
69{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010070 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
71 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072 Iterator output(out, window);
73
Michalis Spyroua4f378d2019-04-26 14:54:54 +010074 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010075 {
76 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
77 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
78
79 vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
80 },
81 input1, input2, output);
82}
83
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000084void sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
85{
86 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
87 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
88 Iterator output(out, window);
89
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010090 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
91 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
92 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
93
Michalis Spyroua4f378d2019-04-26 14:54:54 +010094 execute_window_loop(window, [&](const Coordinates &)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000095 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010096 const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), iq1_info);
97 const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), iq2_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000098
99 const float32x4x4_t ta3 =
100 {
101 {
102 vsubq_f32(ta1.val[0], ta2.val[0]),
103 vsubq_f32(ta1.val[1], ta2.val[1]),
104 vsubq_f32(ta1.val[2], ta2.val[2]),
105 vsubq_f32(ta1.val[3], ta2.val[3]),
106 }
107 };
108
Georgios Pinitas4c5469b2019-05-21 13:32:43 +0100109 const uint8x16_t result = vquantize(ta3, oq_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000110
111 vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
112 },
113 input1, input2, output);
114}
115
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000116void sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
117{
118 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
119 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
120 Iterator output(out, window);
121
122 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
123 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
124 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
125
126 execute_window_loop(window, [&](const Coordinates &)
127 {
128 const float32x4x4_t ta1 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input1.ptr())), iq1_info);
129 const float32x4x4_t ta2 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input2.ptr())), iq2_info);
130
131 const float32x4x4_t ta3 =
132 {
133 {
134 vsubq_f32(ta1.val[0], ta2.val[0]),
135 vsubq_f32(ta1.val[1], ta2.val[1]),
136 vsubq_f32(ta1.val[2], ta2.val[2]),
137 vsubq_f32(ta1.val[3], ta2.val[3]),
138 }
139 };
140
141 const int8x16_t result = vquantize_signed(ta3, oq_info);
142
143 vst1q_s8(reinterpret_cast<qasymm8_signed_t *>(output.ptr()), result);
144 },
145 input1, input2, output);
146}
147
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100148void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
149{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100150 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
151 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100152 Iterator output(out, window);
153
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100154 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100155 {
156 const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
157 const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
158
159 const int16x8x2_t ta3 =
160 {
161 {
162 vsubq_s16(ta1.val[0], ta2.val[0]),
163 vsubq_s16(ta1.val[1], ta2.val[1])
164 }
165 };
166
167 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
168 },
169 input1, input2, output);
170}
171
172void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
173{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100174 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
175 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100176 Iterator output(out, window);
177
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100178 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100179 {
180 const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
181 const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
182
183 const int16x8x2_t ta3 =
184 {
185 {
186 vqsubq_s16(ta1.val[0], ta2.val[0]),
187 vqsubq_s16(ta1.val[1], ta2.val[1])
188 }
189 };
190
191 vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
192 },
193 input1, input2, output);
194}
195
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Subtract two pairs of 8-lane FP16 vectors, pair by pair.
 *
 * @param[in] a Minuend vector pair.
 * @param[in] b Subtrahend vector pair.
 *
 * @return A pair whose val[i] is a.val[i] - b.val[i].
 */
inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t diff;
    diff.val[0] = vsubq_f16(a.val[0], b.val[0]);
    diff.val[1] = vsubq_f16(a.val[1], b.val[1]);
    return diff;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100210
211void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
212{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000213#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100214 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
215 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tellod7a5d222017-07-11 13:54:43 +0100216 Iterator output(out, window);
217
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100218 execute_window_loop(window, [&](const Coordinates &)
Pablo Tellod7a5d222017-07-11 13:54:43 +0100219 {
220 const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
221 const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
222
223 vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b));
224 },
225 input1, input2, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000226#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100227 ARM_COMPUTE_UNUSED(in1);
228 ARM_COMPUTE_UNUSED(in2);
229 ARM_COMPUTE_UNUSED(out);
230 ARM_COMPUTE_UNUSED(window);
231 ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000232#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100233}
234
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100235void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
236{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100237 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
238 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100239 Iterator output(out, window);
240
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100241 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100242 {
243 const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
244 const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
245
246 const float32x4x4_t ta3 =
247 {
248 {
249 vsubq_f32(ta1.val[0], ta2.val[0]),
250 vsubq_f32(ta1.val[1], ta2.val[1]),
251 vsubq_f32(ta1.val[2], ta2.val[2]),
252 vsubq_f32(ta1.val[3], ta2.val[3]),
253 }
254 };
255
256 vst4q_f32(reinterpret_cast<float *>(output.ptr()), ta3);
257 },
258 input1, input2, output);
259}
260void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
261{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100262 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
263 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100264 Iterator output(out, window);
265
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100266 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100267 {
268 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
269 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
270 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
271
272 a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
273 a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
274
275 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
276 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
277 },
278 input1, input2, output);
279}
280
281void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
282{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100283 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
284 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285 Iterator output(out, window);
286
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100287 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100288 {
289 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
290 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
291 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
292
293 a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
294 a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
295
296 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
297 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
298 },
299 input1, input2, output);
300}
301
302void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
303{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100304 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
305 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100306 Iterator output(out, window);
307
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100308 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100309 {
310 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
311 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
312 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
313
314 a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
315 a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
316
317 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
318 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
319 },
320 input1, input2, output);
321}
322
323void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
324{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100325 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
326 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100327 Iterator output(out, window);
328
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100329 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100330 {
331 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
332 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
333 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
334
335 a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
336 a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
337
338 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
339 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
340 },
341 input1, input2, output);
342}
343
344void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
345{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100346 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
347 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100348 Iterator output(out, window);
349
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100350 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100351 {
352 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
353 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
354
355 const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
356 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
357 const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
358 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
359
360 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
361 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
362 },
363 input1, input2, output);
364}
365
366void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
367{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100368 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
369 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100370 Iterator output(out, window);
371
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100372 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100373 {
374 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
375 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
376
377 const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
378 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
379 const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
380 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
381
382 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
383 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
384 },
385 input1, input2, output);
386}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000387
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100388inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000389{
390 ARM_COMPUTE_UNUSED(policy);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100391 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000392 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
393 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
394 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000395
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100396 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
397 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000398
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000399 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
400 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
401 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000402 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000403 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
404 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
405 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
406 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
407 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
408 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
409 "You called subtract with the wrong image formats");
410
411 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000412 input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP
413 && input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP,
414 "Convert policy cannot be WRAP if datatype is QASYMM8 or QASYMM8_SIGNED");
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000415
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100416 // Validate in case of configured output
417 if(output.total_size() > 0)
418 {
419 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
420 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000421 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000422 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100423 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
424 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
425 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
426 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
427 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
428 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
429 "You called subtract with the wrong image formats");
430
431 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
432 "Wrong shape for output");
433 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000434 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000435}
436
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100437inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000438{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100439 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
440 const TensorShape &out_shape = broadcast_pair.first;
441 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000442
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100443 // Auto initialize output if not initialized
444 {
445 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000446
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100447 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
448 {
449 set_format_if_unknown(output, Format::S16);
450 }
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000451 else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100452 {
453 set_format_if_unknown(output, Format::F16);
454 }
455 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
456 {
457 set_format_if_unknown(output, Format::F32);
458 }
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000459 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
460 {
461 set_data_type_if_unknown(output, DataType::QASYMM8);
462 }
463 else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
464 {
465 set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
466 }
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100467 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000468
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100469 Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
470 Window win_input1 = win.broadcast_if_dimension_le_one(input1);
471 Window win_input2 = win.broadcast_if_dimension_le_one(input2);
472
473 AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
474 AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
475 AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
476
477 bool window_changed = update_window_and_padding(win_input1, input1_access)
478 || update_window_and_padding(win_input2, input2_access)
479 || update_window_and_padding(win, output_access);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000480
481 output_access.set_valid_region(win, valid_region);
482
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000483 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000484 return std::make_pair(err, win);
485}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100486} // namespace
487
488NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
489 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
490{
491}
492
493void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
494{
495 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100496 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100497
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100498 // Configure kernel window
499 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
500 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100501
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000502 static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100503 {
504 { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
505 { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
506 { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
507 { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000508 { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000509 { "sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100510 { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
511 { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
512 { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
513 { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514 { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
515 { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
516 { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
517 { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
Pablo Tellod7a5d222017-07-11 13:54:43 +0100518 { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
519 { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100520 };
521
522 _input1 = input1;
523 _input2 = input2;
524 _output = output;
525
526 std::string function_to_call("sub_");
527 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
528 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
529 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
530 function_to_call += string_from_data_type(output->info()->data_type());
531
532 auto it = map_function.find(function_to_call);
533
534 if(it != map_function.end())
535 {
536 _func = it->second;
537 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100538
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000539 INEKernel::configure(win_config.second);
540}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100541
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000542Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000543{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100544 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
545
546 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
547 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100548
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000549 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100550}
551
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100552void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100553{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100554 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100555 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
556 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
557 ARM_COMPUTE_ERROR_ON(_func == nullptr);
558
559 (*_func)(_input1, _input2, _output, window);
560}
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100561
562BorderSize NEArithmeticSubtractionKernel::border_size() const
563{
564 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
565 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100566 return BorderSize{ 0, border, 0, 0 };
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100567}