/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h"

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
namespace
{
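// quantize() is overloaded through SFINAE so that sub_quantized<T> below selects the
// matching requantization routine for its element type at compile time:
// int8_t maps to quantize_qasymm8_signed(), uint8_t to quantize_qasymm8().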
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

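// Element-wise subtraction of two inputs of the same data type T, with optional
// broadcast of a single value along the X dimension for either input. Per element:
// out[i] = is_sat ? saturating_sub(in1[i], in2[i]) : in1[i] - in2[i].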
template <typename T>
void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(T);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
    Iterator output(out, window);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
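                // res holds broadcast - non_broadcast; when input2 is the broadcast
                // operand that equals in2 - in1, so the result is negated below to
                // recover in1 - in2.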
                if(is_broadcast_input_2)
                {
                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
                }
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
                if(is_broadcast_input_2)
                {
                    res = static_cast<T>(-1) * res;
                }

                *(output_ptr + x) = res;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto val1 = wrapper::vloadq(input1_ptr + x);
                const auto val2 = wrapper::vloadq(input2_ptr + x);
                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto val1   = *(input1_ptr + x);
                const auto val2   = *(input2_ptr + x);
                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
            }
        },
        input1, input2, output);
    }
}

template <typename T>
void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);
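    // The subtraction is performed in float: both operands are dequantized as
    // (q - offset) * scale, subtracted, then requantized with the output scale/offset.
    // is_sat is ignored because requantization saturates by construction
    // (vqmovn/quantize clamp to the output type's range).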

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        const float32x4_t             vscale1              = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t             vscale2              = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t               voffset1             = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t               voffset2             = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<T>(broadcast_value), wrapper::traits::vector_128_tag{});

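            // Widen each group of four 8-bit lanes to 32 bits (8 -> 16 -> 32 via two
            // vmovl steps), then dequantize: f = (q - offset) * scale.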
            const float32x4x4_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                }
            };

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                const float bfs   = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
                *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(input1_ptr + x);
                const auto b = wrapper::vloadq(input2_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const float32x4x4_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

                *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
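    // QSYMM16 is symmetric (zero offset), so dequantization is simply q * scale and
    // requantization is r * (1 / output_scale).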

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());

            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);

            const float32x4x2_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
                }
            };
            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t     a  = vld1q_s16(non_broadcast_input_ptr + x);
                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t a = vld1q_s16(input1_ptr + x);
                const int16x8_t b = vld1q_s16(input2_ptr + x);

                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const float32x4x2_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
                *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
                *(output_ptr + x) = res;
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
                *(output_ptr + x) = res;
            }
        }
    },
    input1, input2, output);
}

void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false);
}

void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Swap arguments
    sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true);
}

void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
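    // Both U8 inputs are widened to S16 (zero-extend via vmovl) before subtracting,
    // producing an S16 output.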
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
                                                     static_cast<int16_t>(*(input2_ptr + x)));
            }
        }
    },
    input1, input2, output);
}

inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);

    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
        && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
        && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16)
        && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32)
        && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
        && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
        "You called subtract with the wrong image formats");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
        || (input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
        || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
        "Convert policy cannot be WRAP if datatype is quantized");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
            "You called subtract with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }
    return Status{};
}
} // namespace

NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
    : _func(nullptr), _policy(ConvertPolicy::WRAP)
{
}

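// Illustrative configure-time usage (a sketch, not part of the original file; the
// tensor shapes and data types shown are assumptions):
//
//   TensorInfo src0(TensorShape(16U), 1, DataType::F32);
//   TensorInfo src1(TensorShape(16U), 1, DataType::F32);
//   TensorInfo dst(TensorShape(16U), 1, DataType::F32);
//   NEArithmeticSubtractionKernel kernel;
//   kernel.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
//   // At run time the actual tensors are supplied through an ITensorPack keyed by
//   // TensorType::ACL_SRC_0 / ACL_SRC_1 / ACL_DST (see run_op() below).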
void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));

    _policy = policy;

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

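    // Select the kernel function from the input/output data-type combination;
    // unsupported combinations have already been rejected by validate_arguments().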
    switch(input1->data_type())
    {
        case DataType::U8:
            if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
            {
                _func = &sub_same<uint8_t>;
            }
            else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
            {
                _func = &sub_U8_U8_S16;
            }
            else
            {
                _func = &sub_U8_S16_S16;
            }
            break;
        case DataType::QASYMM8:
            _func = &sub_quantized<uint8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = &sub_quantized<int8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
            break;
        case DataType::S16:
            if(input2->data_type() == DataType::U8)
            {
                _func = &sub_S16_U8_S16;
            }
            else
            {
                _func = &sub_same<int16_t>;
            }
            set_format_if_unknown(*output, Format::S16);
            break;
        case DataType::QSYMM16:
            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
            set_data_type_if_unknown(*output, DataType::QSYMM16);
            break;
        case DataType::S32:
            _func = &sub_same<int32_t>;
            set_format_if_unknown(*output, Format::S32);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &sub_same<float16_t>;
            set_format_if_unknown(*output, Format::F16);
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func = &sub_same<float>;
            set_format_if_unknown(*output, Format::F32);
            break;
        default:
            _func = nullptr;
    }

    // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));

    return Status{};
}

void NEArithmeticSubtractionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    // Dispatch kernel
    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0),
             tensors.get_const_tensor(TensorType::ACL_SRC_1),
             tensors.get_tensor(TensorType::ACL_DST),
             window,
             (_policy == ConvertPolicy::SATURATE));
}
} // namespace arm_compute