/*
 * Copyright (c) 2020-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "src/cpu/kernels/add/generic/neon/impl.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/wrapper.h"

#include <cmath>
namespace arm_compute
{
namespace cpu
{
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000033template <typename ScalarType>
Sheri Zhang61243902021-01-12 18:25:16 +000034void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000035{
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +000036 /** SIMD vector tag type. */
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000037 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
38
39 // Create input windows
Sheri Zhang61243902021-01-12 18:25:16 +000040 Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41 Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000042
43 // Clear X Dimension on execution window as we handle manually
44 Window win = window;
45 win.set(Window::DimX, Window::Dimension(0, 1, 1));
46
47 constexpr int window_step_x = 16 / sizeof(ScalarType);
48 const auto window_start_x = static_cast<int>(window.x().start());
49 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang61243902021-01-12 18:25:16 +000050 const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000051
52 if(is_broadcast_across_x)
53 {
54 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
55 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
56 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang61243902021-01-12 18:25:16 +000057 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
58 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000059
60 // Clear X Dimension on execution window as we handle manually
61 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
62
63 Iterator broadcast_input(broadcast_tensor, broadcast_win);
64 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang61243902021-01-12 18:25:16 +000065 Iterator output(dst, win);
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000066
67 execute_window_loop(win, [&](const Coordinates &)
68 {
69 const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
70 const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
71
72 const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
73 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
74
75 // Compute S elements per iteration
76 int x = window_start_x;
77 for(; x <= (window_end_x - window_step_x); x += window_step_x)
78 {
79 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
80 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
81 wrapper::vstore(output_ptr + x, res);
82 }
83
84 // Compute left-over elements
85 for(; x < window_end_x; ++x)
86 {
87 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
88 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
89 }
90 },
91 broadcast_input, non_broadcast_input, output);
92 }
93 else
94 {
95 // Clear X Dimension on execution window as we handle manually
96 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
97 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
98
Sheri Zhang61243902021-01-12 18:25:16 +000099 Iterator input1(src0, input1_win);
100 Iterator input2(src1, input2_win);
101 Iterator output(dst, win);
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +0000102
103 execute_window_loop(win, [&](const Coordinates &)
104 {
105 const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
106 const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
107 const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
108
109 // Compute S elements per iteration
110 int x = window_start_x;
111 for(; x <= (window_end_x - window_step_x); x += window_step_x)
112 {
113 const auto val1 = wrapper::vloadq(input1_ptr + x);
114 const auto val2 = wrapper::vloadq(input2_ptr + x);
115 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
116 wrapper::vstore(output_ptr + x, res);
117 }
118
119 // Compute left-over elements
120 for(; x < window_end_x; ++x)
121 {
122 const auto val1 = *(input1_ptr + x);
123 const auto val2 = *(input2_ptr + x);
124 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
125 }
126 },
127 input1, input2, output);
128 }
129}
Dana Zlotnikbd2942d2021-11-15 08:46:04 +0200130
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100131bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
132{
133 const auto iq0 = src0->quantization_info().uniform();
134 const auto iq1 = src1->quantization_info().uniform();
135 const auto oq = dst->quantization_info().uniform();
136
137 const auto scale0 = iq0.scale / oq.scale;
138 const auto scale1 = iq1.scale / oq.scale;
139
140 if(scale0 < -31.f || scale0 > 31.f || scale1 < -31.f || scale1 > 31.f)
141 {
142 // The scale factor cannot be stored as 6.10 signed fixed-point number.
143 return false;
144 }
145
146 const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
147 const auto max_acc = (std::abs(scale0) + std::abs(scale1)) * 1024.f + std::abs(offset);
148
149 if(max_acc > 2097151.f) // 2^21 - 1
150 {
151 // It might not be possible to store the result as 22.10 signed fixed-point number.
152 return false;
153 }
154
155 return true;
156}
157
158template <typename ScalarType>
159void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
160{
161 ARM_COMPUTE_UNUSED(policy);
162
163 const auto in0_info = src0->info();
164 const auto in1_info = src1->info();
165
166 const auto &in0_shape = in0_info->tensor_shape();
167 const auto &in1_shape = in1_info->tensor_shape();
168
169 // Create input windows.
170 Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
171 Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);
172
173 // Clear the x dimension on the execution window as we process the whole row each iteration.
174 Window win = window;
175 win.set(Window::DimX, Window::Dimension(0, 1, 1));
176
177 constexpr int window_step_x = 16;
178 const auto window_start_x = window.x().start();
179 const auto window_end_x = window.x().end();
180 const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();
181
182 const auto iq0_info = in0_info->quantization_info().uniform();
183 const auto iq1_info = in1_info->quantization_info().uniform();
184 const auto oq_info = dst->info()->quantization_info().uniform();
185
186 const auto in0_scale = iq0_info.scale / oq_info.scale;
187 const auto in1_scale = iq1_info.scale / oq_info.scale;
188 const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
189
190 const auto in0_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in0_scale * 1024.f));
191 const auto in1_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in1_scale * 1024.f));
192 const auto offset_22p10 = static_cast<int32_t>(support::cpp11::lround(offset * 1024.f));
193
194 if(is_broadcast_across_x)
195 {
196 // Prefix: a = non-broadcast, b = broadcast.
197
198 const auto is_broadcast_input_1 = in1_win.x().step() == 0;
199 auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
200 auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
201 const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
202 const auto b_tensor = is_broadcast_input_1 ? src1 : src0;
203
204 const auto a_scale_6p10 = is_broadcast_input_1 ? in0_scale_6p10 : in1_scale_6p10;
205 const auto b_scale = is_broadcast_input_1 ? in1_scale : in0_scale;
206 const auto a_vscale_6p10 = wrapper::vdup_n(a_scale_6p10, wrapper::traits::vector_64_tag());
207
Viet-Hoa Do910e3f92022-10-11 13:21:35 +0100208#ifndef __aarch64__
209 const auto a_scale = is_broadcast_input_1 ? in0_scale : in1_scale;
210#endif // __aarch64__
211
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100212 // Clear the x dimension on the execution window as we process the whole row each iteration.
213 a_win.set(Window::DimX, Window::Dimension(0, 1, 1));
214
215 Iterator a_input_it(a_tensor, a_win);
216 Iterator b_input_it(b_tensor, b_win);
217 Iterator out_it(dst, win);
218
219 execute_window_loop(win, [&](const Coordinates &)
220 {
221 const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
222 const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
223 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
224
225 const auto b_val = *b_ptr;
Viet-Hoa Do910e3f92022-10-11 13:21:35 +0100226 const auto b_scaled = b_scale * b_val;
227 const auto b_scaled_22p10 = static_cast<int32_t>(support::cpp11::lround(b_scaled * 1024.f));
228 const auto b_scaled_offseted_22p10 = b_scaled_22p10 + offset_22p10;
229 const auto b_vscaled_offseted_22p10 = wrapper::vdup_n(b_scaled_offseted_22p10, wrapper::traits::vector_128_tag());
230
231#ifndef __aarch64__
232 const auto b_scaled_offseted = b_scaled + offset;
233#endif // __aarch64__
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100234
235 int x = window_start_x;
236
237 for(; x <= (window_end_x - window_step_x); x += window_step_x)
238 {
239 // Load the input.
240 const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
241
242 // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
243 const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
244 const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
245
246 // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
247 // Widen and store the result in 32-bit integer.
248 const auto vout_22p10_00 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_0), a_vscale_6p10);
249 const auto vout_22p10_01 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_0), a_vscale_6p10);
250 const auto vout_22p10_10 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_1), a_vscale_6p10);
251 const auto vout_22p10_11 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_1), a_vscale_6p10);
252
253 // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
254 const auto vout_8p8_0 = wrapper::vcombine(
255 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
256 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
257 );
258 const auto vout_8p8_1 = wrapper::vcombine(
259 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
260 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
261 );
262
263 // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
264 const auto vout_8p0 = wrapper::vcombine(
265 wrapper::vqrshrn<8>(vout_8p8_0),
266 wrapper::vqrshrn<8>(vout_8p8_1)
267 );
268
269 // Store the result.
270 wrapper::vstore(out_ptr + x, vout_8p0);
271 }
272
273 // Process the left-over elements.
274 for(; x < window_end_x; ++x)
275 {
Viet-Hoa Do910e3f92022-10-11 13:21:35 +0100276#ifdef __aarch64__
277 out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<2, ScalarType>(int32_t(a_ptr[x]) * a_scale_6p10 + b_scaled_offseted_22p10));
278#else // __aarch64__
279 out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
280#endif // __aarch64__
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100281 }
282 },
283 b_input_it, a_input_it, out_it);
284 }
285 else
286 {
287 const auto vscale0_6p10 = wrapper::vdup_n(in0_scale_6p10, wrapper::traits::vector_64_tag());
288 const auto vscale1_6p10 = wrapper::vdup_n(in1_scale_6p10, wrapper::traits::vector_64_tag());
289 const auto voffset_22p10 = wrapper::vdup_n(offset_22p10, wrapper::traits::vector_128_tag());
290
291 // Clear the x dimension on the execution window as we process the whole row each iteration.
292 in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
293 in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
294
295 Iterator in0_it(src0, in0_win);
296 Iterator in1_it(src1, in1_win);
297 Iterator out_it(dst, win);
298
299 execute_window_loop(win, [&](const Coordinates &)
300 {
301 const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
302 const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
303 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
304
305 int x = window_start_x;
306
307 for(; x <= (window_end_x - window_step_x); x += window_step_x)
308 {
309 // Load the inputs.
310 const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
311 const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
312
313 // Widen the input elements to signed 16-bit regardless of the input signedness.
314 const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
315 const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
316 const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
317 const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
318
319 // Multiply the input elements by the scale factor and add the offset.
320 // Widen and store the result in 32-bit integer.
321 const auto vscaled0_offseted_22p10_00 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_0), vscale0_6p10);
322 const auto vscaled0_offseted_22p10_01 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_0), vscale0_6p10);
323 const auto vscaled0_offseted_22p10_10 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_1), vscale0_6p10);
324 const auto vscaled0_offseted_22p10_11 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_1), vscale0_6p10);
325
326 const auto vout_22p10_00 = wrapper::vmlal(vscaled0_offseted_22p10_00, wrapper::vgetlow(vin1_16p0_0), vscale1_6p10);
327 const auto vout_22p10_01 = wrapper::vmlal(vscaled0_offseted_22p10_01, wrapper::vgethigh(vin1_16p0_0), vscale1_6p10);
328 const auto vout_22p10_10 = wrapper::vmlal(vscaled0_offseted_22p10_10, wrapper::vgetlow(vin1_16p0_1), vscale1_6p10);
329 const auto vout_22p10_11 = wrapper::vmlal(vscaled0_offseted_22p10_11, wrapper::vgethigh(vin1_16p0_1), vscale1_6p10);
330
331 // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
332 const auto vout_8p8_0 = wrapper::vcombine(
333 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
334 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
335 );
336 const auto vout_8p8_1 = wrapper::vcombine(
337 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
338 wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
339 );
340
341 // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
342 const auto vout_8p0 = wrapper::vcombine(
343 wrapper::vqrshrn<8>(vout_8p8_0),
344 wrapper::vqrshrn<8>(vout_8p8_1)
345 );
346
347 // Store the result.
348 wrapper::vstore(out_ptr + x, vout_8p0);
349 }
350
351 // Process the left-over elements.
352 for(; x < window_end_x; ++x)
353 {
Viet-Hoa Do910e3f92022-10-11 13:21:35 +0100354#ifdef __aarch64__
355 out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<2, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_6p10 + int32_t(in1_ptr[x]) * in1_scale_6p10 + offset_22p10));
356#else // __aarch64__
357 out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
358#endif // __aarch64__
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100359 }
360 },
361 in0_it, in1_it, out_it);
362 }
363}
364
Dana Zlotnikbd2942d2021-11-15 08:46:04 +0200365template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
366template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
367template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
368template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
369
370#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
371template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
372#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
373
Viet-Hoa Do40b44192022-09-22 10:24:23 +0100374template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
375template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
376
} // namespace cpu
} // namespace arm_compute