blob: b57fd3edd2527f23936951d74c7b33c9c3d32ca7 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25namespace arm_compute
26{
27/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
28 * Format is in Q0.7 for all elements */
29const std::array<qint8x8_t, 4> exp_tab_qs8 =
30{
31 {
32 vdup_n_s8(0x7F), // 0.9978546
33 vdup_n_s8(0x3F), // 0.4994721
34 vdup_n_s8(0x16), // 0.1763723
35 vdup_n_s8(0x05), // 0.0435108
36 }
37};
38
39/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
40 * Format is in Q0.7 for all elements */
41const std::array<qint8x16_t, 4> exp_tabq_qs8 =
42{
43 {
44 vdupq_n_s8(0x7F), // 0.9978546
45 vdupq_n_s8(0x3F), // 0.4994721
46 vdupq_n_s8(0x16), // 0.1763723
47 vdupq_n_s8(0x05), // 0.0435108
48 }
49};
50
51/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
52 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
53const std::array<qint8x8_t, 4> log_tab_qs8 =
54{
55 {
56 vdup_n_s8(0x5C), // 1.4384189
57 vdup_n_s8(-0x56), // -0.6771900
58 vdup_n_s8(0x29), // 0.3218538
59 vdup_n_s8(-0x0A), // -0.0832229
60 }
61};
62
63/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
64 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
65const std::array<qint8x16_t, 4> log_tabq_qs8 =
66{
67 {
68 vdupq_n_s8(0x5C), // 1.4384189
69 vdupq_n_s8(-0x56), // -0.6771900
70 vdupq_n_s8(0x29), // 0.3218538
71 vdupq_n_s8(-0x0A), // -0.0832229
72 }
73};
74
75inline qint8x8_t vget_low_qs8(qint8x16_t a)
76{
77 return vget_low_s8(a);
78}
79
80inline qint8x8_t vget_high_qs8(qint8x16_t a)
81{
82 return vget_high_s8(a);
83}
84
85inline qint8x8_t vld1_qs8(const qint8_t *addr)
86{
87 return vld1_s8(addr);
88}
89
90inline qint8x16_t vld1q_qs8(const qint8_t *addr)
91{
92 return vld1q_s8(addr);
93}
94
95inline qint16x4_t vld1_qs16(const qint16_t *addr)
96{
97 return vld1_s16(addr);
98}
99
100inline qint16x8_t vld1q_qs16(const qint16_t *addr)
101{
102 return vld1q_s16(addr);
103}
104
105inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
106{
107 return vld1_dup_s8(addr);
108}
109
110inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
111{
112 return vld1q_dup_s8(addr);
113}
114
115inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
116{
117 vst1_s8(addr, b);
118}
119
120inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
121{
122 vst1q_s8(addr, b);
123}
124
125inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
126{
127 vst1_s16(addr, b);
128}
129
130inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
131{
132 vst1q_s16(addr, b);
133}
134
135inline qint8x8_t vqmovn_qs16(qint16x8_t a)
136{
137 return vqmovn_s16(a);
138}
139
140inline qint8x8_t vdup_n_qs8(qint8_t a)
141{
142 return vdup_n_s8(a);
143}
144
145inline qint8x16_t vdupq_n_qs8(qint8_t a)
146{
147 return vdupq_n_s8(a);
148}
149
150inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
151{
152 float32x4x4_t res =
153 {
154 {
155 vdupq_n_f32(a),
156 vdupq_n_f32(a),
157 vdupq_n_f32(a),
158 vdupq_n_f32(a),
159 }
160 };
161 return vcvtq_qs8_f32(res, fixed_point_position);
162}
163
164inline qint16x8_t vdupq_n_qs16(qint16_t a)
165{
166 return vdupq_n_s16(a);
167}
168
169inline qint8x8_t vabs_qs8(qint8x8_t a)
170{
171 return vabs_s8(a);
172}
173
174inline qint8x16_t vabsq_qs8(qint8x16_t a)
175{
176 return vabsq_s8(a);
177}
178
179inline qint8x8_t vqabs_qs8(qint8x8_t a)
180{
181 return vqabs_s8(a);
182}
183
184inline qint8x16_t vqabsq_qs8(qint8x16_t a)
185{
186 return vqabsq_s8(a);
187}
188
189inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
190{
191 return vmax_s8(a, b);
192}
193
194inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
195{
196 return vmaxq_s8(a, b);
197}
198
199inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
200{
201 return vpmax_s8(a, b);
202}
203
204inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
205{
206 return vmin_s8(a, b);
207}
208
209inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
210{
211 return vminq_s8(a, b);
212}
213
214inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
215{
216 return vpmin_s8(a, b);
217}
218
219inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
220{
221 return vadd_s8(a, b);
222}
223
224inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
225{
226 return vaddq_s8(a, b);
227}
228
229inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
230{
231 return vqadd_s8(a, b);
232}
233
234inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
235{
236 return vqaddq_s8(a, b);
237}
238
239inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
240{
241 return vqadd_s16(a, b);
242}
243
244inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
245{
246 return vqaddq_s16(a, b);
247}
248
249inline int16x4_t vpaddl_qs8(qint8x8_t a)
250{
251 return vpaddl_s8(a);
252}
253
254inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
255{
256 return vsub_s8(a, b);
257}
258
259inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
260{
261 return vsubq_s8(a, b);
262}
263
264inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
265{
266 return vqsub_s8(a, b);
267}
268
269inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
270{
271 return vqsubq_s8(a, b);
272}
273
274inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
275{
276 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
277
278 // Initialize the temporary result with a constant used to round up the result
279 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
280
281 // Vector multiply-accumulate long
282 res = vmlal_s8(res, a, b);
283
284 // Shift right by fixed_point_position
285 res = vshlq_s16(res, fixed_point_position_s16);
286
287 // Convert back to qint8
288 return vmovn_s16(res);
289}
290
291inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
292{
293 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
294
295 // Initialize the temporary results with a constant used to round up the result
296 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
297 qint16x8_t res1 = res0;
298
299 // Vector multiply-accumulate long
300 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
301 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
302
303 // Shift right by fixed_point_position
304 res0 = vshlq_s16(res0, fixed_point_position_s16);
305 res1 = vshlq_s16(res1, fixed_point_position_s16);
306
307 // Convert back to qint8
308 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
309}
310
311inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
312{
313 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
314
315 // Initialize the temporary result with a constant used to round up the result
316 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
317
318 // Vector multiply-accumulate long
319 res = vmlal_s8(res, a, b);
320
321 // Shift right by fixed_point_position
322 res = vqshlq_s16(res, fixed_point_position_s16);
323
324 // Convert back to qint8 and saturate
325 return vqmovn_s16(res);
326}
327
328inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
329{
330 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
331
332 // Initialize the temporary results with a constant used to round up the result
333 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
334 qint16x8_t res1 = res0;
335
336 // Vector multiply-accumulate long
337 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
338 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
339
340 // Shift right by fixed_point_position
341 res0 = vqshlq_s16(res0, fixed_point_position_s16);
342 res1 = vqshlq_s16(res1, fixed_point_position_s16);
343
344 // Convert back to qint8 and saturate
345 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
346}
347
348inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
349{
350 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
351
352 qint16x8_t res = vmull_s8(a, b);
353
354 return vqrshlq_s16(res, fixed_point_position_s16);
355}
356
357inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
358{
359 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
360
361 // Initialize the temporary results with a constant used to round up the result
362 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
363
364 // Vector multiply-accumulate long
365 tmp = vmlal_s8(tmp, b, c);
366
367 // Shift right by fixed_point_position
368 tmp = vshlq_s16(tmp, fixed_point_position_s16);
369
370 // Convert back to qint8 and accumulate
371 return vadd_s8(a, vmovn_s16(tmp));
372}
373
374inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
375{
376 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
377
378 // Initialize the temporary results with a constant used to round up the result
379 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
380 qint16x8_t tmp1 = tmp0;
381
382 // Vector multiply-accumulate long
383 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
384 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
385
386 // Shift right by fixed_point_position
387 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
388 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
389
390 // Convert back to qint8 and accumulate
391 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
392}
393
394inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
395{
396 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
397
398 // Initialize the temporary results with a constant used to round up the result
399 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
400
401 // Vector multiply-accumulate long
402 tmp = vmlal_s8(tmp, b, c);
403
404 // Shift right by fixed_point_position
405 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
406
407 // Convert back to qint8 and accumulate
408 return vqadd_s8(a, vqmovn_s16(tmp));
409}
410
411inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
412{
413 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
414
415 // Initialize the temporary results with a constant used to round up the result
416 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
417 qint16x8_t tmp1 = tmp0;
418
419 // Vector multiply-accumulate long
420 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
421 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
422
423 // Shift right by fixed_point_position
424 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
425 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
426
427 // Convert back to qint8 and accumulate
428 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
429 return vqaddq_s8(a, res);
430}
431
432inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
433{
434 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
435
436 // Initialize the temporary results with a constant used to round up the result
437 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
438
439 // Vector multiply-accumulate long
440 tmp = vmlal_s8(tmp, b, c);
441
442 // Shift right by fixed_point_position
443 tmp = vshlq_s16(tmp, fixed_point_position_s16);
444
445 // Accumulate
446 return vaddq_s16(a, tmp);
447}
448
449inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
450{
451 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
452
453 // Initialize the temporary results with a constant used to round up the result
454 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
455
456 // Vector multiply-accumulate long
457 tmp = vmlal_s8(tmp, b, c);
458
459 // Shift right by fixed_point_position
460 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
461
462 // Accumulate
463 return vqaddq_s16(a, tmp);
464}
465
466inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
467{
468 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
469
470 float32x4x2_t res_f32 =
471 {
472 {
473 vdupq_n_f32(0.5f),
474 vdupq_n_f32(0.5f)
475 }
476 };
477
478 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
479 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
480
481 const int32x4x2_t res_s32 =
482 {
483 {
484 vcvtq_s32_f32(res_f32.val[0]),
485 vcvtq_s32_f32(res_f32.val[1]),
486 }
487 };
488
489 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
490
491 return vqmovn_s16(res_s16);
492}
493
494inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
495{
496 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
497
498 float32x4x4_t res_f32 =
499 {
500 {
501 vdupq_n_f32(0.5f),
502 vdupq_n_f32(0.5f),
503 vdupq_n_f32(0.5f),
504 vdupq_n_f32(0.5f)
505 }
506 };
507
508 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
509 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
510 res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
511 res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
512
513 const int32x4x4_t res_s32 =
514 {
515 {
516 vcvtq_s32_f32(res_f32.val[0]),
517 vcvtq_s32_f32(res_f32.val[1]),
518 vcvtq_s32_f32(res_f32.val[2]),
519 vcvtq_s32_f32(res_f32.val[3]),
520 }
521 };
522
523 const int16x8x2_t res_s16 =
524 {
525 {
526 vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
527 vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
528 }
529 };
530
531 return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
532}
533
534inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
535{
536 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
537
538 const int16x8_t res_s16 = vmovl_s8(a);
539
540 const int32x4x2_t res_s32 =
541 {
542 {
543 vmovl_s16(vget_low_s16(res_s16)),
544 vmovl_s16(vget_high_s16(res_s16))
545 }
546 };
547
548 float32x4x2_t res_f32 =
549 {
550 {
551 vcvtq_f32_s32(res_s32.val[0]),
552 vcvtq_f32_s32(res_s32.val[1])
553 }
554 };
555
556 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
557 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
558
559 return res_f32;
560}
561
562inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
563{
564 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
565
566 const int16x8x2_t res_s16 =
567 {
568 {
569 vmovl_s8(vget_low_s8(a)),
570 vmovl_s8(vget_high_s8(a)),
571 }
572 };
573
574 const int32x4x4_t res_s32 =
575 {
576 {
577 vmovl_s16(vget_low_s16(res_s16.val[0])),
578 vmovl_s16(vget_high_s16(res_s16.val[0])),
579 vmovl_s16(vget_low_s16(res_s16.val[1])),
580 vmovl_s16(vget_high_s16(res_s16.val[1])),
581 }
582 };
583
584 float32x4x4_t res_f32 =
585 {
586 {
587 vcvtq_f32_s32(res_s32.val[0]),
588 vcvtq_f32_s32(res_s32.val[1]),
589 vcvtq_f32_s32(res_s32.val[2]),
590 vcvtq_f32_s32(res_s32.val[3])
591 }
592 };
593
594 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
595 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
596 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
597 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
598
599 return res_f32;
600}
601
602inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
603{
604 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
605 const qint8x8_t const_48_over_17 = vdup_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
606 const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
607 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
608
609 // Find shift value
610 const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
611 const qint8x8_t temp = vshl_s8(a, shift_value);
612
613 qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
614
615 uint8x8_t set_one = vcgt_s8(x, const_one);
616 x = vbsl_s8(set_one, const_one, x);
617
618 // Use three iterations of Newton-Raphson method to get the result
619 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
620 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
621 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
622
623 return vshl_s8(x, shift_value);
624}
625
626inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
627{
628 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
629 const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
630 const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
631 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
632
633 // Find shift value
634 const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
635 const qint8x16_t temp = vshlq_s8(a, shift_value);
636
637 qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
638
639 // Set initial guess to one if x > 1
640 uint8x16_t set_one = vcgtq_s8(x, const_one);
641 x = vbslq_s8(set_one, const_one, x);
642
643 // Use three iterations of Newton-Raphson method to get the result
644 x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
645 x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
646 x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
647
648 return vshlq_s8(x, shift_value);
649}
650
651inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
652{
653 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
654 const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
655 const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
656 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
657
658 // Find shift value
659 const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
660 const qint8x16_t temp = vqshlq_s8(a, shift_value);
661
662 qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
663
664 // Set initial guess to one if x > 1
665 uint8x16_t set_one = vcgtq_s8(x, const_one);
666 x = vbslq_s8(set_one, const_one, x);
667
668 // Use three iterations of Newton-Raphson method to get the result
669 x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
670 x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
671 x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
672
673 return vqshlq_s8(x, shift_value);
674}
675
676inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
677{
678 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
679}
680
681inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
682{
683 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
684}
685
686template <bool islog>
687inline qint8x8_t vtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
688{
689 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
690 const qint8x8_t const_one = vdup_n_s8(1);
691 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
692 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
693 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
694 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
695 const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
696 const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
697 const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
698 const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
699 return res;
700}
701
702template <bool islog>
703inline qint8x8_t vqtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
704{
705 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
706 const qint8x8_t const_one = vdup_n_s8(1);
707 const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
708 const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
709 const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
710 const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
711 const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
712 const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
713 const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
714 const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
715 return res;
716}
717
718template <bool islog>
719inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
720{
721 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
722 const qint8x16_t const_one = vdupq_n_s8(1);
723 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
724 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
725 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
726 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
727 const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
728 const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
729 const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
730 const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
731 return res;
732}
733
734template <bool islog>
735inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
736{
737 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
738 const qint8x16_t const_one = vdupq_n_s8(1);
739 const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
740 const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
741 const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
742 const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
743 const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
744 const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
745 const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
746 const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
747 return res;
748}
749
750inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
751{
752 const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
753 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
754 const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
755 const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
756
757 // Perform range reduction [-log(2),log(2)]
758 const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
759
760 // get decimal part from m
761 const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
762
763 qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
764 alpha = vqabs_qs8(vqsub_s8(a, alpha));
765
766 // Polynomial Approximation
767 qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
768 poly = vqadd_s8(poly, const_one);
769
770 // Reconstruct
771 poly = vqshl_s8(poly, dec_m);
772
773 return poly;
774}
775
776inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
777{
778 const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
779 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
780 const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
781 const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
782
783 // Perform range reduction [-log(2),log(2)]
784 const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
785
786 // get decimal part from m
787 const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
788
789 qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
790 alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));
791
792 // Polynomial Approximation
793 qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
794 poly = vqaddq_s8(poly, const_one);
795
796 // Reconstruct
797 poly = vqshlq_s8(poly, dec_m);
798
799 return poly;
800}
801
802inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
803{
804 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
805 const qint8x8_t const_seven_dec = vdup_n_s8(7);
806 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
807
808 // If 0 < a < 1, calculate log(1/x)
809 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
810 qint8x8_t recip = vdup_n_s8(0);
811 recip = vbsl_s8(calc_reciprocal, recip, a);
812
813 // Calculate reciprocal
814 recip = vrecip_qs8(recip, fixed_point_position);
815 a = vbsl_s8(calc_reciprocal, recip, a);
816
817 // Get decimal part of a
818 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
819 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
820
821 // Get exponent of 2^n which is equal or less than dec_a
822 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
823
824 // Get x to range (1, 2]
825 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
826 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
827 const qint8x8_t sum = vmul_s8(shift_value, const_one);
828
829 // Polynomial Approximation
830 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
831
832 // Reconstruct
833 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
834
835 // Set negative value for 0 < a < 1
836 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
837
838 return poly;
839}
840
841inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
842{
843 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
844 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
845 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
846
847 // If 0 < a < 1, calculate log(1/x)
848 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
849 qint8x16_t recip = vdupq_n_s8(0);
850 recip = vbslq_s8(calc_reciprocal, a, recip);
851
852 // Calculate reciprocal
853 recip = vrecipq_qs8(recip, fixed_point_position);
854 a = vbslq_s8(calc_reciprocal, recip, a);
855
856 // Get decimal part of a
857 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
858 qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
859
860 // Get exponent of 2^n which is equal or less than dec_a
861 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
862
863 // Get x to range (1, 2]
864 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
865 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
866 const qint8x16_t sum = vmulq_s8(shift_value, const_one);
867
868 // Polynomial Approximation
869 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
870
871 // Reconstruct
872 poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
873
874 // Set negative value for 0 < a < 1
875 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
876
877 return poly;
878}
879
880inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
881{
882 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
883
884 // Find shift value. Number must be in (0.5, 2) range.
885 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
886
887 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
888 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
889 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
890 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
891 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
892
893 temp = vshl_s8(a, shift_value);
894
895 // Initial guess
896 qint8x8_t x = temp;
897
898 // Calculate (x / 2) * (3 - a * x^2)
899 // After three iterations we have the result for 8 bit
900 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
901 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
902 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
903
904 return vshl_s8(x, shift_value2);
905}
906
907inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
908{
909 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
910
911 // Find shift value. Number must be in (0.5, 2) range.
912 qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
913
914 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
915 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
916 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
917 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
918 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
919
920 temp = vshl_s8(a, shift_value);
921
922 // Initial guess
923 qint8x8_t x = temp;
924
925 // Calculate (x / 2) * (3 - a * x^2)
926 // After three iterations we have the result for 8 bit
927 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
928 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
929 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
930
931 return vshl_s8(x, shift_value2);
932}
933
934inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
935{
936 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
937
938 // Find shift value. Number must be in (0.5, 2) range.
939 qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
940
941 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
942 qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
943 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
944 temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
945 qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
946
947 temp = vshlq_s8(a, shift_value);
948
949 // Initial guess
950 qint8x16_t x = temp;
951
952 // Calculate (x / 2) * (3 - a * x^2)
953 // After three iterations we have the result for 8 bit
954 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
955 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
956 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
957
958 return vshlq_s8(x, shift_value2);
959}
960
961inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
962{
963 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
964
965 // Find shift value. Number must be in (0.5, 2) range.
966 qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
967
968 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
969 qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
970 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
971 temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
972 qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
973
974 temp = vshlq_s8(a, shift_value);
975
976 // Initial guess
977 qint8x16_t x = temp;
978
979 // Calculate (x / 2) * (3 - a * x^2)
980 // After three iterations we have the result for 8 bit
981 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
982 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
983 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
984
985 return vshlq_s8(x, shift_value2);
986}
987
988inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
989{
990 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
991 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
992
993 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
994 qint8x8_t num = vqsub_qs8(exp2x, const_one);
995 qint8x8_t den = vqadd_qs8(exp2x, const_one);
996 qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
997
998 return tanh;
999}
1000
1001inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
1002{
1003 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1004 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1005
1006 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1007 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1008 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1009 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1010
1011 return tanh;
1012}
1013
1014inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1015{
1016 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1017}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001018
1019inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1020{
1021 float32x4x2_t res =
1022 {
1023 {
1024 vmaxq_f32(a.val[0], b.val[0]),
1025 vmaxq_f32(a.val[1], b.val[1])
1026 }
1027 };
1028 return res;
1029}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001030}