blob: 9b7b235c9fa14e677a925274677d24b7f195487d [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +00002 * Copyright (c) 2016-2020 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
25
Anthony Barbiereaefd002018-07-20 17:49:35 +010026#include "arm_compute/core/CPP/Validate.h"
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000027#include "arm_compute/core/NEON/NEAsymm.h"
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +000028#include "arm_compute/core/NEON/NESymm.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Validate.h"
31
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032namespace arm_compute
33{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034namespace
35{
/** Number of tensor elements every kernel in this file consumes per window step along X. */
constexpr unsigned int num_elems_processed_per_iteration = 16U;
37
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
39{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010040 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
41 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042 Iterator output(out, window);
43
Michalis Spyroua4f378d2019-04-26 14:54:54 +010044 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010045 {
46 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
47 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
48
49 vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
50 },
51 input1, input2, output);
52}
53
54void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
55{
Georgios Pinitascbf39c62018-09-10 15:07:45 +010056 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
57 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058 Iterator output(out, window);
59
Michalis Spyroua4f378d2019-04-26 14:54:54 +010060 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010061 {
62 const uint8x16_t ta1 = vld1q_u8(input1.ptr());
63 const uint8x16_t ta2 = vld1q_u8(input2.ptr());
64
65 vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
66 },
67 input1, input2, output);
68}
69
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000070void sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
71{
72 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
73 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
74 Iterator output(out, window);
75
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010076 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
77 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
78 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
79
Michalis Spyroua4f378d2019-04-26 14:54:54 +010080 execute_window_loop(window, [&](const Coordinates &)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000081 {
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010082 const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), iq1_info);
83 const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), iq2_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000084
85 const float32x4x4_t ta3 =
86 {
87 {
88 vsubq_f32(ta1.val[0], ta2.val[0]),
89 vsubq_f32(ta1.val[1], ta2.val[1]),
90 vsubq_f32(ta1.val[2], ta2.val[2]),
91 vsubq_f32(ta1.val[3], ta2.val[3]),
92 }
93 };
94
Georgios Pinitas4c5469b2019-05-21 13:32:43 +010095 const uint8x16_t result = vquantize(ta3, oq_info);
Manuel Bottini6a2b6e82019-02-25 13:50:11 +000096
97 vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
98 },
99 input1, input2, output);
100}
101
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000102void sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
103{
104 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
105 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
106 Iterator output(out, window);
107
108 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
109 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
110 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
111
112 execute_window_loop(window, [&](const Coordinates &)
113 {
114 const float32x4x4_t ta1 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input1.ptr())), iq1_info);
115 const float32x4x4_t ta2 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input2.ptr())), iq2_info);
116
117 const float32x4x4_t ta3 =
118 {
119 {
120 vsubq_f32(ta1.val[0], ta2.val[0]),
121 vsubq_f32(ta1.val[1], ta2.val[1]),
122 vsubq_f32(ta1.val[2], ta2.val[2]),
123 vsubq_f32(ta1.val[3], ta2.val[3]),
124 }
125 };
126
127 const int8x16_t result = vquantize_signed(ta3, oq_info);
128
129 vst1q_s8(reinterpret_cast<qasymm8_signed_t *>(output.ptr()), result);
130 },
131 input1, input2, output);
132}
133
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000134void sub_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
135{
136 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
137 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
138 Iterator output(out, window);
139
140 const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
141 const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
142 const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
143
144 execute_window_loop(window, [&](const Coordinates &)
145 {
146 const int16x8x2_t in1_s16 =
147 {
148 {
149 vld1q_s16(reinterpret_cast<const qsymm16_t *>(input1.ptr())),
150 vld1q_s16(reinterpret_cast<const qsymm16_t *>(input1.ptr()) + 8),
151 }
152 };
153 const int16x8x2_t in2_s16 =
154 {
155 {
156 vld1q_s16(reinterpret_cast<const qsymm16_t *>(input2.ptr())),
157 vld1q_s16(reinterpret_cast<const qsymm16_t *>(input2.ptr()) + 8),
158 }
159 };
160 const float32x4x4_t ta1 = vdequantize(in1_s16, iq1_info);
161 const float32x4x4_t ta2 = vdequantize(in2_s16, iq2_info);
162
163 const float32x4x4_t ta3 =
164 {
165 {
166 vsubq_f32(ta1.val[0], ta2.val[0]),
167 vsubq_f32(ta1.val[1], ta2.val[1]),
168 vsubq_f32(ta1.val[2], ta2.val[2]),
169 vsubq_f32(ta1.val[3], ta2.val[3]),
170 }
171 };
172
173 const int16x8x2_t result = vquantize_qsymm16(ta3, oq_info);
174
175 vst1q_s16(reinterpret_cast<qsymm16_t *>(output.ptr()), result.val[0]);
176 vst1q_s16(reinterpret_cast<qsymm16_t *>(output.ptr()) + 8, result.val[1]);
177 },
178 input1, input2, output);
179}
180
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100181void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
182{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100183 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
184 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100185 Iterator output(out, window);
186
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100187 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100188 {
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000189 const int16x8x2_t ta1 =
190 {
191 {
192 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
193 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8),
194 }
195 };
196 const int16x8x2_t ta2 =
197 {
198 {
199 vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
200 vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8),
201 }
202 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100203
204 const int16x8x2_t ta3 =
205 {
206 {
207 vsubq_s16(ta1.val[0], ta2.val[0]),
208 vsubq_s16(ta1.val[1], ta2.val[1])
209 }
210 };
211
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000212 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3.val[0]);
213 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, ta3.val[1]);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214 },
215 input1, input2, output);
216}
217
218void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
219{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100220 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
221 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100222 Iterator output(out, window);
223
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100224 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100225 {
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000226 const int16x8x2_t ta1 =
227 {
228 {
229 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
230 vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8),
231 }
232 };
233 const int16x8x2_t ta2 =
234 {
235 {
236 vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
237 vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8),
238 }
239 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100240
241 const int16x8x2_t ta3 =
242 {
243 {
244 vqsubq_s16(ta1.val[0], ta2.val[0]),
245 vqsubq_s16(ta1.val[1], ta2.val[1])
246 }
247 };
248
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000249 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3.val[0]);
250 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, ta3.val[1]);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251 },
252 input1, input2, output);
253}
254
Pablo Tellod7a5d222017-07-11 13:54:43 +0100255void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
256{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000257#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100258 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
259 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tellod7a5d222017-07-11 13:54:43 +0100260 Iterator output(out, window);
261
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100262 execute_window_loop(window, [&](const Coordinates &)
Pablo Tellod7a5d222017-07-11 13:54:43 +0100263 {
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000264 const float16x8x2_t a =
265 {
266 {
267 vld1q_f16(reinterpret_cast<const float16_t *>(input1.ptr())),
268 vld1q_f16(reinterpret_cast<const float16_t *>(input1.ptr()) + 8),
269 }
270 };
271 const float16x8x2_t b =
272 {
273 {
274 vld1q_f16(reinterpret_cast<const float16_t *>(input2.ptr())),
275 vld1q_f16(reinterpret_cast<const float16_t *>(input2.ptr()) + 8),
276 }
277 };
278 const float16x8x2_t res =
279 {
280 {
281 vsubq_f16(a.val[0], b.val[0]),
282 vsubq_f16(a.val[1], b.val[1]),
283 }
284 };
Pablo Tellod7a5d222017-07-11 13:54:43 +0100285
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000286 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res.val[0]);
287 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, res.val[1]);
Pablo Tellod7a5d222017-07-11 13:54:43 +0100288 },
289 input1, input2, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000290#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100291 ARM_COMPUTE_UNUSED(in1);
292 ARM_COMPUTE_UNUSED(in2);
293 ARM_COMPUTE_UNUSED(out);
294 ARM_COMPUTE_UNUSED(window);
295 ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000296#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellod7a5d222017-07-11 13:54:43 +0100297}
298
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100299void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
300{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100301 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
302 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303 Iterator output(out, window);
304
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100305 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100306 {
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000307 const float32x4x4_t ta1 =
308 {
309 {
310 vld1q_f32(reinterpret_cast<const float *>(input1.ptr())),
311 vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 4),
312 vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 8),
313 vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 12),
314 }
315 };
316 const float32x4x4_t ta2 =
317 {
318 {
319 vld1q_f32(reinterpret_cast<const float *>(input2.ptr())),
320 vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 4),
321 vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 8),
322 vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 12),
323 }
324 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100325
326 const float32x4x4_t ta3 =
327 {
328 {
329 vsubq_f32(ta1.val[0], ta2.val[0]),
330 vsubq_f32(ta1.val[1], ta2.val[1]),
331 vsubq_f32(ta1.val[2], ta2.val[2]),
332 vsubq_f32(ta1.val[3], ta2.val[3]),
333 }
334 };
335
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000336 vst1q_f32(reinterpret_cast<float *>(output.ptr()), ta3.val[0]);
337 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, ta3.val[1]);
338 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, ta3.val[2]);
339 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, ta3.val[3]);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100340 },
341 input1, input2, output);
342}
343void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
344{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100345 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
346 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100347 Iterator output(out, window);
348
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100349 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100350 {
351 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
352 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
353 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
354
355 a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
356 a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
357
358 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
359 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
360 },
361 input1, input2, output);
362}
363
364void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
365{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100366 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
367 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100368 Iterator output(out, window);
369
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100370 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100371 {
372 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
373 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
374 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
375
376 a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
377 a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
378
379 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
380 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
381 },
382 input1, input2, output);
383}
384
385void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
386{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100387 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
388 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100389 Iterator output(out, window);
390
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100391 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100392 {
393 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
394 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
395 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
396
397 a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
398 a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
399
400 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
401 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
402 },
403 input1, input2, output);
404}
405
406void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
407{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100408 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
409 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100410 Iterator output(out, window);
411
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100412 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100413 {
414 const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
415 int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
416 int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
417
418 a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
419 a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
420
421 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
422 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
423 },
424 input1, input2, output);
425}
426
427void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
428{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100429 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
430 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100431 Iterator output(out, window);
432
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100433 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100434 {
435 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
436 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
437
438 const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
439 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
440 const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
441 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
442
443 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
444 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
445 },
446 input1, input2, output);
447}
448
449void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
450{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100451 Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
452 Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100453 Iterator output(out, window);
454
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100455 execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100456 {
457 const uint8x16_t av_0 = vld1q_u8(input1.ptr());
458 const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
459
460 const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
461 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
462 const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
463 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
464
465 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
466 vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
467 },
468 input1, input2, output);
469}
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000470
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100471inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000472{
473 ARM_COMPUTE_UNUSED(policy);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100474 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000475 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
476 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
477 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000478
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100479 const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
480 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000481
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000482 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
483 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
484 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000485 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000486 && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000487 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
488 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
489 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
490 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
491 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
492 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
493 "You called subtract with the wrong image formats");
494
495 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000496 input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000497 && input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP
498 && input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP,
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000499 "Convert policy cannot be WRAP if datatype is QASYMM8 or QASYMM8_SIGNED");
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000500
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100501 // Validate in case of configured output
502 if(output.total_size() > 0)
503 {
504 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
505 !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000506 && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000507 && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000508 && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16)
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100509 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
510 && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
511 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
512 && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
513 && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
514 && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
515 "You called subtract with the wrong image formats");
516
517 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
518 "Wrong shape for output");
519 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000520 return Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000521}
522
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100523inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000524{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100525 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
526 const TensorShape &out_shape = broadcast_pair.first;
527 const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000528
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100529 // Auto initialize output if not initialized
530 {
531 set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000532
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100533 if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
534 {
535 set_format_if_unknown(output, Format::S16);
536 }
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000537 else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100538 {
539 set_format_if_unknown(output, Format::F16);
540 }
541 else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
542 {
543 set_format_if_unknown(output, Format::F32);
544 }
Georgios Pinitasd7d7e902019-12-18 15:40:54 +0000545 else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
546 {
547 set_data_type_if_unknown(output, DataType::QASYMM8);
548 }
549 else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
550 {
551 set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
552 }
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000553 else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
554 {
555 set_data_type_if_unknown(output, DataType::QSYMM16);
556 }
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100557 }
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000558
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100559 Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
560 Window win_input1 = win.broadcast_if_dimension_le_one(input1);
561 Window win_input2 = win.broadcast_if_dimension_le_one(input2);
562
563 AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
564 AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
565 AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
566
567 bool window_changed = update_window_and_padding(win_input1, input1_access)
568 || update_window_and_padding(win_input2, input2_access)
569 || update_window_and_padding(win, output_access);
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000570
571 output_access.set_valid_region(win, valid_region);
572
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000573 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000574 return std::make_pair(err, win);
575}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100576} // namespace
577
578NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
579 : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
580{
581}
582
583void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
584{
585 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100586 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100587
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100588 // Configure kernel window
589 auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
590 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100591
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000592 static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100593 {
594 { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
595 { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
596 { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
597 { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
Manuel Bottini6a2b6e82019-02-25 13:50:11 +0000598 { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
Michalis Spyrou6f58b372019-12-04 12:00:36 +0000599 { "sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000600 { "sub_saturate_QSYMM16_QSYMM16_QSYMM16", &sub_saturate_QSYMM16_QSYMM16_QSYMM16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100601 { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
602 { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
603 { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
604 { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100605 { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
606 { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
607 { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
608 { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
Pablo Tellod7a5d222017-07-11 13:54:43 +0100609 { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
610 { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100611 };
612
613 _input1 = input1;
614 _input2 = input2;
615 _output = output;
616
617 std::string function_to_call("sub_");
618 function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
619 function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
620 function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
621 function_to_call += string_from_data_type(output->info()->data_type());
622
623 auto it = map_function.find(function_to_call);
624
625 if(it != map_function.end())
626 {
627 _func = it->second;
628 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100629
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000630 INEKernel::configure(win_config.second);
631}
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100632
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000633Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo397d58a2017-11-30 15:19:11 +0000634{
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100635 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
636
637 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
638 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100639
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000640 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100641}
642
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100643void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100644{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100645 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100646 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
647 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
648 ARM_COMPUTE_ERROR_ON(_func == nullptr);
649
650 (*_func)(_input1, _input2, _output, window);
651}
Georgios Pinitascbf39c62018-09-10 15:07:45 +0100652
653BorderSize NEArithmeticSubtractionKernel::border_size() const
654{
655 const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
656 const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100657 return BorderSize{ 0, border, 0, 0 };
Michele Di Giorgio9f2403f2020-03-27 10:23:44 +0000658}
659} // namespace arm_compute