blob: fdf94f08594cc2c05748bc6c0675e9e7cc9cf913 [file] [log] [blame]
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +00001/*
Viet-Hoa Dofd472f02023-03-15 14:05:06 +00002 * Copyright (c) 2020-2023 Arm Limited.
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +010024
25#ifndef ACL_SRC_CORE_NEON_SVEMATH_INL
26#define ACL_SRC_CORE_NEON_SVEMATH_INL
27
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +000028#include <cmath>
29#include <limits>
30
Michalis Spyrou20fca522021-06-07 14:23:57 +010031#if defined(__ARM_FEATURE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE)
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +000032
Sang-Hoon Park0870db42020-12-08 18:42:19 +000033#ifndef M_PI
34#define M_PI (3.14159265358979323846)
35#endif // M_PI
36
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +000037namespace arm_compute
38{
/** Evaluate a degree-7 polynomial at @p x using Estrin's scheme (FP32, zeroing predication).
 *
 * Note the non-sequential coefficient mapping used by the callers:
 *   result = coeff_1 + coeff_5*x + coeff_3*x^2 + coeff_7*x^3
 *          + coeff_2*x^4 + coeff_6*x^5 + coeff_4*x^6 + coeff_8*x^7
 *
 * @param pg      Governing predicate; inactive lanes are zeroed.
 * @param x       Evaluation point.
 * @param coeff_* Broadcast polynomial coefficients (see mapping above).
 * @return Polynomial value per lane.
 */
inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg,
                                       svfloat32_t x,
                                       svfloat32_t coeff_1,
                                       svfloat32_t coeff_2,
                                       svfloat32_t coeff_3,
                                       svfloat32_t coeff_4,
                                       svfloat32_t coeff_5,
                                       svfloat32_t coeff_6,
                                       svfloat32_t coeff_7,
                                       svfloat32_t coeff_8)
{
    // Shared powers of x for the Estrin combination.
    const auto x2 = svmul_f32_z(pg, x, x);
    const auto x4 = svmul_f32_z(pg, x2, x2);
    // Pairwise linear terms: c_lo + c_hi * x.
    const auto pair_15 = svmla_f32_z(pg, coeff_1, coeff_5, x);
    const auto pair_37 = svmla_f32_z(pg, coeff_3, coeff_7, x);
    const auto pair_26 = svmla_f32_z(pg, coeff_2, coeff_6, x);
    const auto pair_48 = svmla_f32_z(pg, coeff_4, coeff_8, x);
    // Combine: (pair_15 + pair_37*x^2) + (pair_26 + pair_48*x^2) * x^4.
    const auto low_half  = svmla_f32_z(pg, pair_15, pair_37, x2);
    const auto high_half = svmla_f32_z(pg, pair_26, pair_48, x2);
    return svmla_f32_z(pg, low_half, high_half, x4);
}
59
/** Evaluate a degree-7 polynomial at @p x using Estrin's scheme (FP16, zeroing predication).
 *
 * Note the non-sequential coefficient mapping used by the callers:
 *   result = coeff_1 + coeff_5*x + coeff_3*x^2 + coeff_7*x^3
 *          + coeff_2*x^4 + coeff_6*x^5 + coeff_4*x^6 + coeff_8*x^7
 *
 * @param pg      Governing predicate; inactive lanes are zeroed.
 * @param x       Evaluation point.
 * @param coeff_* Broadcast polynomial coefficients (see mapping above).
 * @return Polynomial value per lane.
 */
inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg,
                                       svfloat16_t x,
                                       svfloat16_t coeff_1,
                                       svfloat16_t coeff_2,
                                       svfloat16_t coeff_3,
                                       svfloat16_t coeff_4,
                                       svfloat16_t coeff_5,
                                       svfloat16_t coeff_6,
                                       svfloat16_t coeff_7,
                                       svfloat16_t coeff_8)
{
    // Shared powers of x for the Estrin combination.
    const auto x2 = svmul_f16_z(pg, x, x);
    const auto x4 = svmul_f16_z(pg, x2, x2);
    // Pairwise linear terms: c_lo + c_hi * x.
    const auto pair_15 = svmla_f16_z(pg, coeff_1, coeff_5, x);
    const auto pair_37 = svmla_f16_z(pg, coeff_3, coeff_7, x);
    const auto pair_26 = svmla_f16_z(pg, coeff_2, coeff_6, x);
    const auto pair_48 = svmla_f16_z(pg, coeff_4, coeff_8, x);
    // Combine: (pair_15 + pair_37*x^2) + (pair_26 + pair_48*x^2) * x^4.
    const auto low_half  = svmla_f16_z(pg, pair_15, pair_37, x2);
    const auto high_half = svmla_f16_z(pg, pair_26, pair_48, x2);
    return svmla_f16_z(pg, low_half, high_half, x4);
}
80
/** Approximate reciprocal 1/x (FP16): hardware estimate refined by two
 *  Newton-Raphson steps, each computing estimate *= (2 - x * estimate)
 *  via svrecps.
 */
inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x)
{
    auto estimate = svrecpe_f16(x);
    for (int step = 0; step < 2; ++step)
    {
        // svrecps_f16(x, e) yields (2 - x*e), the Newton-Raphson correction factor.
        estimate = svmul_f16_z(pg, svrecps_f16(x, estimate), estimate);
    }
    return estimate;
}
88
/** Approximate reciprocal 1/x (FP32): hardware estimate refined by two
 *  Newton-Raphson steps, each computing estimate *= (2 - x * estimate)
 *  via svrecps.
 */
inline svfloat32_t svinv_f32_z(svbool_t pg, svfloat32_t x)
{
    auto estimate = svrecpe_f32(x);
    for (int step = 0; step < 2; ++step)
    {
        // svrecps_f32(x, e) yields (2 - x*e), the Newton-Raphson correction factor.
        estimate = svmul_f32_z(pg, svrecps_f32(x, estimate), estimate);
    }
    return estimate;
}
96
// Degree-5 polynomial coefficients for the e^r approximation used by
// svexp_f32_z (r is the range-reduced argument). Stored as exact FP32
// bit patterns so the constants round-trip precisely.
static const uint32_t svexp_f32_coeff[] = {
    0x3f7ffff6, // x^1: 0x1.ffffecp-1f
    0x3efffedb, // x^2: 0x1.fffdb6p-2f
    0x3e2aaf33, // x^3: 0x1.555e66p-3f
    0x3d2b9f17, // x^4: 0x1.573e2ep-5f
    0x3c072010, // x^5: 0x1.0e4020p-7f
};
104
/** Vectorized e^x for FP32 (zeroing predication).
 *
 * Range-reduces x = n*ln(2) + r, evaluates a degree-5 polynomial for e^r
 * (coefficients in svexp_f32_coeff) and scales the result by 2^n.
 * Inputs below min_input produce 0; inputs above max_input produce +inf.
 *
 * @param pg Governing predicate.
 * @param x  Input values.
 * @return e^x per lane.
 */
inline svfloat32_t svexp_f32_z(svbool_t pg, svfloat32_t x)
{
    // Broadcast the polynomial coefficients from their exact bit patterns.
    const auto c1 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[0]));
    const auto c2 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[1]));
    const auto c3 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[2]));
    const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3]));
    const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4]));

    const auto shift   = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
    const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
    const auto neg_ln2_hi =
        svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
    const auto neg_ln2_lo =
        svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f

    const auto inf       = svdup_n_f32(std::numeric_limits<float>::infinity());
    const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
    const auto zero      = svdup_n_f32(0.f);
    const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)

    // Range reduction:
    //   e^x = 2^n * e^r
    // where:
    //   n = floor(x / ln(2))
    //   r = x - n * ln(2)
    //
    // By adding x / ln(2) with 2^23 + 127 (shift):
    //   * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 forces decimal part
    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
    //     the whole fraction part of z in FP32 format.
    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
    //     (i.e. n) because the decimal part has been pushed out and lost.
    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
    const auto z     = svmla_f32_z(pg, shift, x, inv_ln2);
    const auto n     = svsub_f32_z(pg, z, shift);
    const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n

    // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
    // This outperforms longer Taylor series (3-4 tabs) both in term of accuracy and performance.
    const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
    const auto r    = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);

    // Compute the truncated Taylor series of e^r.
    //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
    const auto r2 = svmul_f32_z(pg, r, r);

    const auto p1     = svmul_f32_z(pg, c1, r);
    const auto p23    = svmla_f32_z(pg, c2, c3, r);
    const auto p45    = svmla_f32_z(pg, c4, c5, r);
    const auto p2345  = svmla_f32_z(pg, p23, p45, r2);
    const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);

    auto poly = svmla_f32_z(pg, scale, p12345, scale);

    // Handle underflow and overflow.
    poly = svsel_f32(svcmplt_f32(pg, x, min_input), zero, poly);
    poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);

    return poly;
}
166
/** Vectorized e^x for FP16, computed in FP32 for accuracy.
 *
 * The even-indexed halfwords ("bottom") are widened directly; svrevh swaps
 * the two halfwords inside each 32-bit container so the odd-indexed
 * halfwords ("top") can be widened the same way. Both halves go through
 * svexp_f32_z and are narrowed back; svtrn1 re-interleaves them into the
 * original element order.
 */
inline svfloat16_t svexp_f16_z(svbool_t pg, svfloat16_t x)
{
    auto bottom = svcvt_f32_z(pg, x);
    auto pg_top = svptrue_b16();
    auto top    = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x))));

    bottom = svexp_f32_z(pg, bottom);
    top    = svexp_f32_z(pg_top, top);

    return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
}
178
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +0100179#ifdef ARM_COMPUTE_ENABLE_SVE2
180
/** SVE2 variant of svexp_f16_z.
 *
 * svcvtlt widens the odd-indexed halfwords and svcvtnt narrows back into
 * the odd positions, so no revh/trn shuffles are needed.
 */
inline svfloat16_t svexp_f16_z_sve2(svbool_t pg, svfloat16_t x)
{
    auto bottom = svcvt_f32_z(pg, x);    // even-indexed halfwords -> FP32
    auto top    = svcvtlt_f32_x(pg, x);  // odd-indexed halfwords -> FP32
    auto pg_top = pg;

    bottom = svexp_f32_z(pg, bottom);
    top    = svexp_f32_z(pg_top, top);

    // Narrow 'bottom' into the even positions, then 'top' into the odd ones.
    return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
}
192
193#endif // ARM_COMPUTE_ENABLE_SVE2
194
/** Vectorized tanh for FP32: tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
 *
 * The argument is clamped to [-10, 10] first; tanh saturates to +/-1 well
 * inside that range, and the clamp keeps 2x within svexp_f32_z's domain.
 */
inline svfloat32_t svtanh_f32_z(svbool_t pg, svfloat32_t val)
{
    const svfloat32_t CONST_1        = svdup_n_f32(1.f);
    const svfloat32_t CONST_2        = svdup_n_f32(2.f);
    const svfloat32_t CONST_MIN_TANH = svdup_n_f32(-10.f);
    const svfloat32_t CONST_MAX_TANH = svdup_n_f32(10.f);

    // Saturate the argument to the supported range.
    const svfloat32_t clamped = svmin_f32_z(pg, svmax_f32_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH);
    // Single exponential, then form the rational expression.
    const svfloat32_t exp_2x = svexp_f32_z(pg, svmul_f32_z(pg, CONST_2, clamped));
    const svfloat32_t numer  = svsub_f32_z(pg, exp_2x, CONST_1);
    const svfloat32_t denom  = svadd_f32_z(pg, exp_2x, CONST_1);
    return svdiv_f32_z(pg, numer, denom);
}
209
/** Vectorized tanh for FP16: tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
 *
 * The argument is clamped to [-10, 10] first; tanh saturates to +/-1 well
 * inside that range, and the clamp keeps 2x within svexp_f16_z's domain.
 */
inline svfloat16_t svtanh_f16_z(svbool_t pg, svfloat16_t val)
{
    const svfloat16_t CONST_1        = svdup_n_f16(1.f);
    const svfloat16_t CONST_2        = svdup_n_f16(2.f);
    const svfloat16_t CONST_MIN_TANH = svdup_n_f16(-10.f);
    const svfloat16_t CONST_MAX_TANH = svdup_n_f16(10.f);

    // Saturate the argument to the supported range.
    const svfloat16_t clamped = svmin_f16_z(pg, svmax_f16_z(pg, val, CONST_MIN_TANH), CONST_MAX_TANH);
    // Single exponential, then form the rational expression.
    const svfloat16_t exp_2x = svexp_f16_z(pg, svmul_f16_z(pg, CONST_2, clamped));
    const svfloat16_t numer  = svsub_f16_z(pg, exp_2x, CONST_1);
    const svfloat16_t denom  = svadd_f16_z(pg, exp_2x, CONST_1);
    return svdiv_f16_z(pg, numer, denom);
}
224
/** Vectorized natural logarithm for FP32 (zeroing predication).
 *
 * Decomposes x = 2^m * val by extracting the unbiased exponent m from the
 * FP32 bit pattern and rebasing the mantissa so val falls in [1, 2).
 * ln(val) is approximated with a degree-7 polynomial (svtaylor_poly_f32_z's
 * coefficient ordering), then ln(x) = poly + m * ln(2).
 *
 * @param pg Governing predicate.
 * @param x  Input values (expected positive; behaviour for x <= 0 follows
 *           the raw bit manipulation — not a checked domain).
 * @return ln(x) per lane.
 */
inline svfloat32_t svlog_f32_z(svbool_t pg, svfloat32_t x)
{
    /** Logarithm polynomial coefficients */
    const svfloat32_t log_tab_1 = svdup_n_f32(-2.29561495781f);
    const svfloat32_t log_tab_2 = svdup_n_f32(-2.47071170807f);
    const svfloat32_t log_tab_3 = svdup_n_f32(-5.68692588806f);
    const svfloat32_t log_tab_4 = svdup_n_f32(-0.165253549814f);
    const svfloat32_t log_tab_5 = svdup_n_f32(5.17591238022f);
    const svfloat32_t log_tab_6 = svdup_n_f32(0.844007015228f);
    const svfloat32_t log_tab_7 = svdup_n_f32(4.58445882797f);
    const svfloat32_t log_tab_8 = svdup_n_f32(0.0141278216615f);

    const auto CONST_127 = svdup_n_s32(127);           // 127
    const auto CONST_LN2 = svdup_n_f32(0.6931471805f); // ln(2)

    // Extract exponent: m = (bits >> 23) - 127.
    auto m = svsub_s32_z(pg, svasr_n_s32_z(pg, svreinterpret_s32_f32(x), 23), CONST_127);
    // Subtract m from the exponent field, leaving the mantissa with a biased
    // exponent of 127, i.e. val in [1, 2).
    auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));

    // Polynomial Approximation
    auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6,
                                    log_tab_7, log_tab_8);

    // Reconstruct: ln(x) = poly + m * ln(2).
    poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);

    return poly;
}
253
/** Vectorized natural logarithm for FP16, computed in FP32 for accuracy.
 *
 * Same bottom/top split as svexp_f16_z: even-indexed halfwords widen
 * directly; svrevh moves the odd-indexed halfwords into widening position.
 * Results are narrowed and re-interleaved with svtrn1.
 */
inline svfloat16_t svlog_f16_z(svbool_t pg, svfloat16_t x)
{
    auto bottom = svcvt_f32_z(pg, x);
    auto pg_top = svptrue_b16();
    auto top    = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(x))));

    bottom = svlog_f32_z(pg, bottom);
    top    = svlog_f32_z(pg_top, top);

    return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
}
Sang-Hoon Park0870db42020-12-08 18:42:19 +0000265
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +0100266#ifdef ARM_COMPUTE_ENABLE_SVE2
267
/** SVE2 variant of svlog_f16_z.
 *
 * Uses svcvtlt/svcvtnt to widen/narrow the odd-indexed halfwords in place,
 * avoiding the revh/trn shuffles of the SVE1 path.
 */
inline svfloat16_t svlog_f16_z_sve2(svbool_t pg, svfloat16_t x)
{
    auto bottom = svcvt_f32_z(pg, x);    // even-indexed halfwords -> FP32
    auto top    = svcvtlt_f32_x(pg, x);  // odd-indexed halfwords -> FP32
    auto pg_top = pg;

    bottom = svlog_f32_z(pg, bottom);
    top    = svlog_f32_z(pg_top, top);

    // Narrow 'bottom' into the even positions, then 'top' into the odd ones.
    return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
}
279
280#endif // ARM_COMPUTE_ENABLE_SVE2
281
/** Vectorized sine for FP32 (zeroing predication).
 *
 * Reduces |val| modulo pi into [0, pi/2], evaluates a 5-term Taylor series
 * for sin on the reduced argument, then restores the sign from the quadrant
 * parity and the input's sign.
 *
 * @param pg  Governing predicate.
 * @param val Input angle in radians.
 * @return sin(val) per lane.
 */
inline svfloat32_t svsin_f32_z(svbool_t pg, svfloat32_t val)
{
    using ScalarType = float;
    using IntType    = uint32_t;

    // Reciprocal factorial steps: coeff_k converts the x^(2k-1) term into x^(2k+1).
    constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
    constexpr float te_sin_coeff3 = 0.05f;           // 1/(4*5)
    constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
    constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)

    const auto pi_v   = wrapper::svdup_n(ScalarType(M_PI));
    const auto pio2_v = wrapper::svdup_n(ScalarType(M_PI / 2));
    const auto ipi_v  = wrapper::svdup_n(ScalarType(1 / M_PI));

    // Find positive or negative: c_v = |trunc(val / pi)| (multiple-of-pi count);
    // odd multiples flip the sign of sin, as does a negative input.
    const auto c_v    = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v)));
    const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0)));
    const auto odd_v  = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))),
                                wrapper::svdup_n(IntType(0)));

    // Final sign flip = odd quadrant XOR negative input.
    auto neg_v = sveor_z(pg, odd_v, sign_v);

    // Modulus a - (n * int(a*(1/n))): reduce |val| modulo pi.
    auto ma          = svsub_z(pg, svabs_z(pg, val), svmul_z(pg, pi_v, wrapper::svcvt_z<ScalarType>(pg, c_v)));
    const auto reb_v = svcmpge(pg, ma, pio2_v);

    // Rebase a between 0 and pi/2 using sin(pi - a) = sin(a).
    ma = svsel(reb_v, svsub_z(pg, pi_v, ma), ma);

    // Taylor series: sin(x) = x - x^3/3! + x^5/5! - x^7/7! + x^9/9!
    const auto ma2 = svmul_z(pg, ma, ma);

    // 2nd elem: x^3 / 3!
    auto elem = svmul_z(pg, svmul_z(pg, ma, ma2), wrapper::svdup_n(ScalarType(te_sin_coeff2)));
    auto res  = svsub_z(pg, ma, elem);

    // 3rd elem: x^5 / 5!
    elem = svmul_z(pg, svmul_z(pg, elem, ma2), wrapper::svdup_n(ScalarType(te_sin_coeff3)));
    res  = svadd_z(pg, res, elem);

    // 4th elem: x^7 / 7!
    elem = svmul_z(pg, svmul_z(pg, elem, ma2), wrapper::svdup_n(ScalarType(te_sin_coeff4)));
    res  = svsub_z(pg, res, elem);

    // 5th elem: x^9 / 9!
    elem = svmul_z(pg, svmul_z(pg, elem, ma2), wrapper::svdup_n(ScalarType(te_sin_coeff5)));
    res  = svadd_z(pg, res, elem);

    // Change of sign for lanes flagged in neg_v.
    res = svneg_m(res, neg_v, res);
    return res;
}
334
/** Vectorized sine for FP16, computed in FP32 for accuracy.
 *
 * Same bottom/top split as svexp_f16_z: even-indexed halfwords widen
 * directly; svrevh moves the odd-indexed halfwords into widening position.
 * Results are narrowed and re-interleaved with svtrn1.
 */
inline svfloat16_t svsin_f16_z(svbool_t pg, svfloat16_t val)
{
    auto bottom = svcvt_f32_z(pg, val);
    auto pg_top = svptrue_b16();
    auto top    = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(val))));

    bottom = svsin_f32_z(pg, bottom);
    top    = svsin_f32_z(pg_top, top);

    return svtrn1(svcvt_f16_z(pg, bottom), svcvt_f16_z(pg_top, top));
}
346
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +0100347#ifdef ARM_COMPUTE_ENABLE_SVE2
348
/** SVE2 variant of svsin_f16_z.
 *
 * Uses svcvtlt/svcvtnt to widen/narrow the odd-indexed halfwords in place,
 * avoiding the revh/trn shuffles of the SVE1 path.
 */
inline svfloat16_t svsin_f16_z_sve2(svbool_t pg, svfloat16_t val)
{
    auto bottom = svcvt_f32_z(pg, val);    // even-indexed halfwords -> FP32
    auto top    = svcvtlt_f32_x(pg, val);  // odd-indexed halfwords -> FP32
    auto pg_top = pg;

    bottom = svsin_f32_z(pg, bottom);
    top    = svsin_f32_z(pg_top, top);

    // Narrow 'bottom' into the even positions, then 'top' into the odd ones.
    return svcvtnt_f16_m(svcvt_f16_z(pg, bottom), pg_top, top);
}
360
361#endif // ARM_COMPUTE_ENABLE_SVE2
362
/** Elementwise power for FP32: a^b computed as e^(b * ln(a)).
 * NOTE(review): as with the scalar identity, this is meaningful for a > 0;
 * behaviour for a <= 0 follows svlog_f32_z's raw bit manipulation.
 */
inline svfloat32_t svpow_f32_z(svbool_t pg, svfloat32_t a, svfloat32_t b)
{
    const auto log_a   = svlog_f32_z(pg, a);
    const auto b_log_a = svmul_z(pg, b, log_a);
    return svexp_f32_z(pg, b_log_a);
}
367
/** Elementwise power for FP16, computed in FP32 for accuracy.
 *
 * Both operands are split into even-indexed ("bottom") and odd-indexed
 * ("top") halfwords via the svrevh trick, raised with svpow_f32_z, then
 * narrowed and re-interleaved with svtrn1.
 */
inline svfloat16_t svpow_f16_z(svbool_t pg, svfloat16_t a, svfloat16_t b)
{
    auto a_bottom = svcvt_f32_z(pg, a);
    auto b_bottom = svcvt_f32_z(pg, b);

    auto pg_top = svptrue_b16();
    auto a_top  = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(a))));
    auto b_top  = svcvt_f32_z(pg_top, svreinterpret_f16(svrevh_z(svptrue_b16(), svreinterpret_u32(b))));

    auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom);
    auto res_top    = svpow_f32_z(pg_top, a_top, b_top);

    return svtrn1(svcvt_f16_z(pg, res_bottom), svcvt_f16_z(pg_top, res_top));
}
382
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +0100383#ifdef ARM_COMPUTE_ENABLE_SVE2
384
/** SVE2 variant of svpow_f16_z.
 *
 * Uses svcvtlt/svcvtnt to widen/narrow the odd-indexed halfwords in place,
 * avoiding the revh/trn shuffles of the SVE1 path.
 */
inline svfloat16_t svpow_f16_z_sve2(svbool_t pg, svfloat16_t a, svfloat16_t b)
{
    auto a_bottom = svcvt_f32_z(pg, a);
    auto b_bottom = svcvt_f32_z(pg, b);

    auto pg_top = pg;
    auto a_top  = svcvtlt_f32_x(pg, a); // odd-indexed halfwords -> FP32
    auto b_top  = svcvtlt_f32_x(pg, b);

    auto res_bottom = svpow_f32_z(pg, a_bottom, b_bottom);
    auto res_top    = svpow_f32_z(pg_top, a_top, b_top);

    // Narrow 'res_bottom' into the even positions, then 'res_top' into the odd ones.
    return svcvtnt_f16_m(svcvt_f16_z(pg, res_bottom), pg_top, res_top);
}
399
400#endif // ARM_COMPUTE_ENABLE_SVE2
401
Michalis Spyrou20fca522021-06-07 14:23:57 +0100402#if defined(ARM_COMPUTE_ENABLE_SVE2)
/** Convert four FP32 vectors to one saturated U8 vector (SVE2).
 *
 * Pipeline: FP32 -> U32 conversion, then saturating narrowing U32 -> U16 ->
 * U8. svqxtnt interleaves its two sources (bottom elements in even lanes,
 * top elements in odd lanes), so after each narrowing stage the uzp1/uzp2 +
 * svsplice sequence de-interleaves the result back into concatenation
 * order: in_0 elements first, then in_1, in_2, in_3.
 */
template <>
inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0,
                                                 const svfloat32_t &in_1,
                                                 const svfloat32_t &in_2,
                                                 const svfloat32_t &in_3)
{
    svuint8_t  out;
    const auto all_true_pg = svptrue_b32();
    auto       tmp_0       = svcvt_u32_f32_z(all_true_pg, in_0);
    auto       tmp_1       = svcvt_u32_f32_z(all_true_pg, in_1);
    auto       tmp_2       = svcvt_u32_f32_z(all_true_pg, in_2);
    auto       tmp_3       = svcvt_u32_f32_z(all_true_pg, in_3);

    // Saturating narrow U32 -> U16; results are interleaved (even: tmp_0/2, odd: tmp_1/3).
    auto tmp_16_0 = svqxtnt_u32(svqxtnb_u32(tmp_0), tmp_1);
    auto tmp_16_1 = svqxtnt_u32(svqxtnb_u32(tmp_2), tmp_3);

    // De-interleave: even lanes and odd lanes of each vector.
    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);

    // Keep the first half of each de-interleaved vector and concatenate.
    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);

    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);

    // Saturating narrow U16 -> U8 (again interleaved), then compact once more.
    out = svqxtnt_u16(svqxtnb_u16(tmp_16_0), tmp_16_1);

    auto out_uzp_0 = svuzp1(out, out);
    auto out_uzp_1 = svuzp2(out, out);

    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
    out = svsplice(pg, out_uzp_0, out_uzp_1);

    return out;
}
439
/** Convert four FP32 vectors to one saturated S8 vector (SVE2).
 *
 * Signed twin of the svuint8_t specialization: FP32 -> S32 conversion, then
 * saturating narrowing S32 -> S16 -> S8. svqxtnt interleaves its two
 * sources (bottom elements in even lanes, top elements in odd lanes), so
 * after each narrowing stage the uzp1/uzp2 + svsplice sequence
 * de-interleaves the result back into concatenation order: in_0 elements
 * first, then in_1, in_2, in_3.
 */
template <>
inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0,
                                               const svfloat32_t &in_1,
                                               const svfloat32_t &in_2,
                                               const svfloat32_t &in_3)
{
    svint8_t   out;
    const auto all_true_pg = svptrue_b32();
    auto       tmp_0       = svcvt_s32_f32_z(all_true_pg, in_0);
    auto       tmp_1       = svcvt_s32_f32_z(all_true_pg, in_1);
    auto       tmp_2       = svcvt_s32_f32_z(all_true_pg, in_2);
    auto       tmp_3       = svcvt_s32_f32_z(all_true_pg, in_3);

    // Saturating narrow S32 -> S16; results are interleaved (even: tmp_0/2, odd: tmp_1/3).
    auto tmp_16_0 = svqxtnt_s32(svqxtnb_s32(tmp_0), tmp_1);
    auto tmp_16_1 = svqxtnt_s32(svqxtnb_s32(tmp_2), tmp_3);

    // De-interleave: even lanes and odd lanes of each vector.
    auto tmp_16_uzp_0 = svuzp1(tmp_16_0, tmp_16_0);
    auto tmp_16_uzp_1 = svuzp2(tmp_16_0, tmp_16_0);
    auto tmp_16_uzp_2 = svuzp1(tmp_16_1, tmp_16_1);
    auto tmp_16_uzp_3 = svuzp2(tmp_16_1, tmp_16_1);

    // Keep the first half of each de-interleaved vector and concatenate.
    auto pg = svwhilelt_b16_s32(0, svcnth() / 2);

    tmp_16_0 = svsplice(pg, tmp_16_uzp_0, tmp_16_uzp_1);
    tmp_16_1 = svsplice(pg, tmp_16_uzp_2, tmp_16_uzp_3);

    // Saturating narrow S16 -> S8 (again interleaved), then compact once more.
    out = svqxtnt_s16(svqxtnb_s16(tmp_16_0), tmp_16_1);

    auto out_uzp_0 = svuzp1(out, out);
    auto out_uzp_1 = svuzp2(out, out);

    pg  = svwhilelt_b8_s32(0, svcntb() / 2);
    out = svsplice(pg, out_uzp_0, out_uzp_1);

    return out;
}
Michalis Spyrou20fca522021-06-07 14:23:57 +0100476#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
Sang-Hoon Parkdcf3c7e2021-03-04 17:03:46 +0000477
Michalis Spyrouaa51a5b2020-11-22 00:49:42 +0000478} // namespace arm_compute
Michalis Spyrou20fca522021-06-07 14:23:57 +0100479#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
Viet-Hoa Do5ef0bdd2023-10-19 10:15:54 +0100480
481#endif // ACL_SRC_CORE_NEON_SVEMATH_INL