/*
 * Copyright (c) 2020-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "src/cpu/kernels/add/generic/neon/impl.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/NEON/wrapper/wrapper.h"

namespace arm_compute
{
namespace cpu
{
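/** Add two tensors of the same data type element-wise.
 *
 * If one input is broadcast along the X dimension, its single value is duplicated across a
 * vector register and added to every element of the other input's row; otherwise both inputs
 * are read vector by vector. @p policy selects saturating (vqadd/add_sat) or wrapping addition.
 */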
template <typename ScalarType>
void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

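    // One 128-bit NEON register holds 16 bytes, i.e. 16 / sizeof(ScalarType) elements per vector iteration.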
    constexpr int window_step_x         = 16 / sizeof(ScalarType);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());

            const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
            const auto       broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                const auto res             = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src0, input1_win);
        Iterator input2(src1, input2_win);
        Iterator output(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto val1 = wrapper::vloadq(input1_ptr + x);
                const auto val2 = wrapper::vloadq(input2_ptr + x);
                const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto val1   = *(input1_ptr + x);
                const auto val2   = *(input2_ptr + x);
                *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
            }
        },
        input1, input2, output);
    }
}

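/** Add two tensors of the same data type, treating the buffers as flat 1D arrays.
 *
 * The window's X range is applied directly to the tensors' buffers, with no per-dimension
 * striding or broadcasting, so this path presumes both inputs and the output are contiguous
 * in memory.
 */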
template <typename ScalarType>
void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    const ScalarType *src0_ptr = reinterpret_cast<const ScalarType *>(src0->buffer());
    const ScalarType *src1_ptr = reinterpret_cast<const ScalarType *>(src1->buffer());
    ScalarType       *dst_ptr  = reinterpret_cast<ScalarType *>(dst->buffer());

    constexpr int window_step_x  = 16 / sizeof(ScalarType);
    const auto    window_start_x = static_cast<int>(window.x().start());
    const auto    window_end_x   = static_cast<int>(window.x().end());

    int x = window_start_x;
    for(; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        const auto val1 = wrapper::vloadq(src0_ptr + x);
        const auto val2 = wrapper::vloadq(src1_ptr + x);
        const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
        wrapper::vstore(dst_ptr + x, res);
    }

    // Compute left-over elements
    for(; x < window_end_x; ++x)
    {
        const auto val1 = *(src0_ptr + x);
        const auto val2 = *(src1_ptr + x);
        *(dst_ptr + x)  = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
    }
}

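/** Check whether the quantized addition of @p src0 and @p src1 into @p dst can use the
 * fixed-point kernel.
 *
 * Each rescale factor (input scale divided by output scale) must be representable as a signed
 * 6.10 fixed-point number, and the worst-case accumulator must fit in a signed 22.10
 * fixed-point number; otherwise the generic quantized path has to be used instead.
 */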
bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
    const auto iq0 = src0->quantization_info().uniform();
    const auto iq1 = src1->quantization_info().uniform();
    const auto oq  = dst->quantization_info().uniform();

    const auto scale0 = iq0.scale / oq.scale;
    const auto scale1 = iq1.scale / oq.scale;

    if(scale0 < -31.f || scale0 > 31.f || scale1 < -31.f || scale1 > 31.f)
    {
        // The scale factor cannot be stored as a 6.10 signed fixed-point number.
        return false;
    }

    const auto offset  = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
    const auto max_acc = (std::abs(scale0) + std::abs(scale1)) * 1024.f + std::abs(offset);

    if(max_acc > 2097151.f) // 2^21 - 1
    {
        // It might not be possible to store the result as a 22.10 signed fixed-point number.
        return false;
    }

    return true;
}

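/** Add two quantized 8-bit tensors using integer fixed-point arithmetic.
 *
 * With uniform quantization, dst = src0 * in0_scale + src1 * in1_scale + offset, where
 * in0_scale = iq0.scale / oq.scale, in1_scale = iq1.scale / oq.scale and
 * offset = oq.offset - in0_scale * iq0.offset - in1_scale * iq1.offset.
 * The scales are encoded as signed 6.10 and the offset as signed 22.10 fixed-point values,
 * so a whole row can be computed with widening integer multiply-accumulates.
 */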
template <typename ScalarType>
void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
{
    ARM_COMPUTE_UNUSED(policy);

    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_scale = iq0_info.scale / oq_info.scale;
    const auto in1_scale = iq1_info.scale / oq_info.scale;
    const auto offset    = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);

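    // Encode the rescale factors as signed 6.10 and the combined offset as signed 22.10
    // fixed-point values, i.e. round(value * 2^10).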
    const auto in0_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in0_scale * 1024.f));
    const auto in1_scale_6p10 = static_cast<int16_t>(support::cpp11::lround(in1_scale * 1024.f));
    const auto offset_22p10   = static_cast<int32_t>(support::cpp11::lround(offset * 1024.f));

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_scale_6p10  = is_broadcast_input_1 ? in0_scale_6p10 : in1_scale_6p10;
        const auto b_scale       = is_broadcast_input_1 ? in1_scale : in0_scale;
        const auto a_vscale_6p10 = wrapper::vdup_n(a_scale_6p10, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            const auto b_val                      = *b_ptr;
            const auto b_scaled_22p10             = static_cast<int32_t>(support::cpp11::lround(b_scale * b_val * 1024.f));
            const auto b_vscaled_offseted_22p10   = wrapper::vdup_n(b_scaled_22p10 + offset_22p10, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the input.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
                // Widen and store the result in 32-bit integer.
                const auto vout_22p10_00 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_0), a_vscale_6p10);
                const auto vout_22p10_01 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_0), a_vscale_6p10);
                const auto vout_22p10_10 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgetlow(a_vin_16p0_1), a_vscale_6p10);
                const auto vout_22p10_11 = wrapper::vmlal(b_vscaled_offseted_22p10, wrapper::vgethigh(a_vin_16p0_1), a_vscale_6p10);

                // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
                const auto vout_8p8_0 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
                );
                const auto vout_8p8_1 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
                );

                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
                const auto vout_8p0 = wrapper::vcombine(
                    wrapper::vqrshrn<8>(vout_8p8_0),
                    wrapper::vqrshrn<8>(vout_8p8_1)
                );

                // Store the result.
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
                out_ptr[x] = utility::clamp<int32_t, ScalarType>((int32_t(a_ptr[x]) * a_scale_6p10 + b_scaled_22p10 + offset_22p10) >> 10);
            }
        },
        b_input_it, a_input_it, out_it);
    }
    else
    {
        const auto vscale0_6p10  = wrapper::vdup_n(in0_scale_6p10, wrapper::traits::vector_64_tag());
        const auto vscale1_6p10  = wrapper::vdup_n(in1_scale_6p10, wrapper::traits::vector_64_tag());
        const auto voffset_22p10 = wrapper::vdup_n(offset_22p10, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                // Multiply the input elements by the scale factor and add the offset.
                // Widen and store the result in 32-bit integer.
                const auto vscaled0_offseted_22p10_00 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_0), vscale0_6p10);
                const auto vscaled0_offseted_22p10_01 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_0), vscale0_6p10);
                const auto vscaled0_offseted_22p10_10 = wrapper::vmlal(voffset_22p10, wrapper::vgetlow(vin0_16p0_1), vscale0_6p10);
                const auto vscaled0_offseted_22p10_11 = wrapper::vmlal(voffset_22p10, wrapper::vgethigh(vin0_16p0_1), vscale0_6p10);

                const auto vout_22p10_00 = wrapper::vmlal(vscaled0_offseted_22p10_00, wrapper::vgetlow(vin1_16p0_0), vscale1_6p10);
                const auto vout_22p10_01 = wrapper::vmlal(vscaled0_offseted_22p10_01, wrapper::vgethigh(vin1_16p0_0), vscale1_6p10);
                const auto vout_22p10_10 = wrapper::vmlal(vscaled0_offseted_22p10_10, wrapper::vgetlow(vin1_16p0_1), vscale1_6p10);
                const auto vout_22p10_11 = wrapper::vmlal(vscaled0_offseted_22p10_11, wrapper::vgethigh(vin1_16p0_1), vscale1_6p10);

                // Remove 2 bits of the fractional part, round, narrow to 16-bit and saturate the result.
                const auto vout_8p8_0 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_00),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_01)
                );
                const auto vout_8p8_1 = wrapper::vcombine(
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_10),
                    wrapper::vqrshrn_ex<2, ScalarType>(vout_22p10_11)
                );

                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
                const auto vout_8p0 = wrapper::vcombine(
                    wrapper::vqrshrn<8>(vout_8p8_0),
                    wrapper::vqrshrn<8>(vout_8p8_1)
                );

                // Store the result.
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(
                    (int32_t(in0_ptr[x]) * in0_scale_6p10 + int32_t(in1_ptr[x]) * in1_scale_6p10 + offset_22p10) >> 10);
            }
        },
        in0_it, in1_it, out_it);
    }
}

template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */

template void add_same_neon_as_1d_array<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon_as_1d_array<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon_as_1d_array<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_same_neon_as_1d_array<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
template void add_same_neon_as_1d_array<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */

template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);

} // namespace cpu
} // namespace arm_compute