/*
 * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NESymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"

namespace arm_compute
{
namespace
{
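// quantize<T>() helpers: std::enable_if selects the QASYMM8_SIGNED overload for
// int8_t and the QASYMM8 overload for uint8_t, so the templated kernels below can
// requantize scalar leftover elements with one call, e.g. quantize<uint8_t>(fval, qinfo).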
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

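/** Element-wise subtraction for two tensors of the same data type.
 *
 * Vectorizes over the x dimension (16 bytes, i.e. 16 / sizeof(T) elements per
 * iteration) and handles x-broadcasting when one input has a window step of 0:
 * the broadcast scalar is duplicated into a vector, subtracted against the
 * non-broadcast input, and the result is negated when input2 is the broadcast
 * side so that the output is always in1 - in2. @p is_sat selects saturating
 * (vqsub) over wrapping (vsub) arithmetic.
 */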
template <typename T>
void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(T);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
    Iterator output(out, window);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
                if(is_broadcast_input_2)
                {
                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
                }
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
                if(is_broadcast_input_2)
                {
                    res = static_cast<T>(-1) * res;
                }

                *(output_ptr + x) = res;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto val1 = wrapper::vloadq(input1_ptr + x);
                const auto val2 = wrapper::vloadq(input2_ptr + x);
                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto val1   = *(input1_ptr + x);
                const auto val2   = *(input2_ptr + x);
                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
            }
        },
        input1, input2, output);
    }
}

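/** Element-wise subtraction for QASYMM8/QASYMM8_SIGNED tensors.
 *
 * Works in float: each input is dequantized as (q - offset) * scale, the
 * difference is taken, then requantized as q_out = (a_f - b_f) / oq_scale + oq_offset
 * (the vmlaq_f32 below fuses the multiply-add). On AArch64 vcvtnq_s32_f32 rounds
 * to nearest; the AArch32 fallback vcvtq_s32_f32 truncates. Results are narrowed
 * back to 8 bits with saturation, so @p is_sat is unused here.
 */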
template <typename T>
void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        const float32x4_t             vscale1              = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t             vscale2              = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t               voffset1             = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t               voffset2             = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<T>(broadcast_value), wrapper::traits::vector_128_tag{});

            const float32x4x4_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                }
            };

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                const float bfs   = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
                *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(input1_ptr + x);
                const auto b = wrapper::vloadq(input2_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const float32x4x4_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

                *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

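/** Element-wise subtraction for QSYMM16 tensors.
 *
 * QSYMM16 is symmetric (zero offset), so dequantization is a plain multiply by
 * the scale. Eight int16_t elements per iteration are widened to two
 * float32x4_t lanes, subtracted in float, rescaled by 1 / oq_scale and
 * narrowed back with saturation (vqmovn_s32); @p is_sat is therefore unused.
 */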
void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        // Use the quantization info of the side that actually broadcasts: the function-scope
        // vscale1/vscale2 are tied to in1/in2 and would be swapped whenever in1 is the broadcast input
        const float32x4_t vscale_broadcast     = vdupq_n_f32(broadcast_qinfo.scale);
        const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());

            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);

            const float32x4x2_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale_broadcast),
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale_broadcast),
                }
            };
            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t     a = vld1q_s16(non_broadcast_input_ptr + x);
                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale_non_broadcast),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale_non_broadcast),
                    }
                };

                // Keep the in1 - in2 ordering used by every other kernel in this file:
                // when in2 is the broadcast input, the non-broadcast values (af) are the minuend
                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (afs - bfs) : (bfs - afs), oq_info);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t a = vld1q_s16(input1_ptr + x);
                const int16x8_t b = vld1q_s16(input2_ptr + x);

                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const float32x4x2_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
                *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

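/** Subtraction of an S16 and a U8 input with an S16 output.
 *
 * The U8 input is widened to S16 (vmovl + reinterpret) before subtracting.
 * @p is_swapped flips the operand order so this single implementation also
 * serves U8 - S16; @p is_sat switches between the wrapping and saturating paths.
 */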
void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
                *(output_ptr + x) = res;
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
                *(output_ptr + x) = res;
            }
        }
    },
    input1, input2, output);
}

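// Thin wrappers over sub_S16_U8_S16_impl: the U8 - S16 variant swaps the operands
// and sets is_swapped so the implementation can restore the subtraction order.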
void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false);
}

void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Swap arguments
    sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true);
}

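/** Subtraction of two U8 inputs with an S16 output.
 *
 * Both inputs are widened to S16 before subtracting; since the difference of
 * two 8-bit values always fits in 16 bits, the saturating path cannot actually
 * saturate and is kept for consistency with the other variants.
 */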
void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
                                                     static_cast<int16_t>(*(input2_ptr + x)));
            }
        }
    },
    input1, input2, output);
}

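/** Validate an input/input/output combination against the supported set.
 *
 * Checks data-type support for each tensor, broadcast compatibility of the two
 * input shapes, the WRAP-policy restriction for quantized types (the quantized
 * kernels always saturate), and, if the output is already configured, that its
 * data type and shape are consistent with the inputs.
 */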
inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);

    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
        && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
        && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16)
        && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
        && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
        "You called subtract with the wrong image formats");

    // The quantized kernels always saturate, so WRAP is not a valid policy for them.
    // Each type pair is grouped with ||: chaining the pairs with && could never be true.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        ((input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
         || (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
         || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16))
        && policy == ConvertPolicy::WRAP,
        "Convert policy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
            "You called subtract with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }
    return Status{};
}
} // namespace

NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
    : _func(nullptr), _policy(ConvertPolicy::WRAP)
{
}

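// configure() maps the (input1, input2, output) data-type triple onto one of the
// kernel functions above, auto-initializes the output's shape and data type when
// still unknown, and builds an execution window over the broadcast output shape.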
void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));

    _policy = policy;

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

    switch(input1->data_type())
    {
        case DataType::U8:
            if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
            {
                _func = &sub_same<uint8_t>;
            }
            else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
            {
                _func = &sub_U8_U8_S16;
            }
            else
            {
                _func = &sub_U8_S16_S16;
            }
            break;
        case DataType::QASYMM8:
            _func = &sub_quantized<uint8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = &sub_quantized<int8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
            break;
        case DataType::S16:
            if(input2->data_type() == DataType::U8)
            {
                _func = &sub_S16_U8_S16;
            }
            else
            {
                _func = &sub_same<int16_t>;
            }
            set_format_if_unknown(*output, Format::S16);
            break;
        case DataType::QSYMM16:
            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
            set_data_type_if_unknown(*output, DataType::QSYMM16);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &sub_same<float16_t>;
            set_format_if_unknown(*output, Format::F16);
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func = &sub_same<float>;
            set_format_if_unknown(*output, Format::F32);
            break;
        default:
            _func = nullptr;
    }

    // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));

    return Status{};
}

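// run_op() is called by the scheduler with a (possibly split) window: it fetches
// the tensors from the input/output maps and dispatches the function pointer
// selected in configure(), translating the convert policy into the is_sat flag.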
void NEArithmeticSubtractionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    // Dispatch kernel
    (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), window, (_policy == ConvertPolicy::SATURATE));
}
} // namespace arm_compute