blob: 05e481561d2f547e2e172057908cd419d59ebdcb [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
// Note: each vector lane holds the same coefficient; the trailing comment on each
// entry gives the decimal value the fixed-point constant approximates.
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
// Note: each vector lane holds the same coefficient; the trailing comment on each
// entry gives the decimal value the fixed-point constant approximates.
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the low half (first 8 elements) of a QS8 vector. Forwards to vget_low_s8. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the low half (first 4 elements) of a QS16 vector. Forwards to vget_low_s16. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the high half (last 8 elements) of a QS8 vector. Forwards to vget_high_s8. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the high half (last 4 elements) of a QS16 vector. Forwards to vget_high_s16. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 QS8 elements from @p addr. Forwards to vld1_s8. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr. Forwards to vld1_s16. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr. Forwards to vld1q_s8. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr. Forwards to vld1q_s16. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** De-interleaving load of 2x8 QS16 elements from @p addr. Forwards to vld2q_s16. */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 QS8 elements to @p addr. Forwards to vst1_s8. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr. Forwards to vst1_s16. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr. Forwards to vst1q_s8. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr. Forwards to vst1q_s16. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Interleaving store of 2x8 QS16 elements to @p addr. Forwards to vst2q_s16. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow of a QS16 vector to QS8. Forwards to vqmovn_s16. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow of a QS32 vector to QS16. Forwards to vqmovn_s32. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
223
/** Duplicate a QS8 scalar across all 8 lanes. Forwards to vdup_n_s8. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar across all 4 lanes. Forwards to vdup_n_s16. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar across all 16 lanes. Forwards to vdupq_n_s8. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
239inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
240{
241 float32x4x4_t res =
242 {
243 {
244 vdupq_n_f32(a),
245 vdupq_n_f32(a),
246 vdupq_n_f32(a),
247 vdupq_n_f32(a),
248 }
249 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100250 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251}
252
/** Duplicate a QS16 scalar across all 8 lanes. Forwards to vdupq_n_s16. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar across all 4 lanes. Forwards to vdupq_n_s32. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
262
/** Absolute value of 8 QS8 elements (non-saturating). Forwards to vabs_s8. */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating). Forwards to vabs_s16. */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating). Forwards to vabsq_s8. */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating). Forwards to vabsq_s16. */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. Forwards to vqabs_s8. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. Forwards to vqabs_s16. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. Forwards to vqabsq_s8. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. Forwards to vqabsq_s16. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
302
/** Element-wise maximum of two 8-element QS8 vectors. Forwards to vmax_s8. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 4-element QS16 vectors. Forwards to vmax_s16. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 16-element QS8 vectors. Forwards to vmaxq_s8. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8-element QS8 vectors. Forwards to vpmax_s8. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 4-element QS16 vectors. Forwards to vpmax_s16. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 8-element QS16 vectors. Forwards to vmaxq_s16. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8-element QS8 vectors. Forwards to vmin_s8. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 4-element QS16 vectors. Forwards to vmin_s16. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 16-element QS8 vectors. Forwards to vminq_s8. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8-element QS8 vectors. Forwards to vpmin_s8. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 4-element QS16 vectors. Forwards to vpmin_s16. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 8-element QS16 vectors. Forwards to vminq_s16. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
362
/** Element-wise addition of two 8-element QS8 vectors (wrapping). Forwards to vadd_s8. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Element-wise addition of two 4-element QS16 vectors (wrapping). Forwards to vadd_s16. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Element-wise addition of two 16-element QS8 vectors (wrapping). Forwards to vaddq_s8. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Element-wise addition of two 8-element QS16 vectors (wrapping). Forwards to vaddq_s16. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating addition of two 8-element QS8 vectors. Forwards to vqadd_s8. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating addition of two 4-element QS16 vectors. Forwards to vqadd_s16. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating addition of two 2-element QS32 vectors. Forwards to vqadd_s32. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating addition of two 16-element QS8 vectors. Forwards to vqaddq_s8. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating addition of two 8-element QS16 vectors. Forwards to vqaddq_s16. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating addition of two 4-element QS32 vectors. Forwards to vqaddq_s32. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-and-widen of 8 QS8 elements to 4 16-bit sums. Forwards to vpaddl_s8. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
417
/** Element-wise subtraction of two 8-element QS8 vectors (wrapping). Forwards to vsub_s8. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Element-wise subtraction of two 4-element QS16 vectors (wrapping). Forwards to vsub_s16. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Element-wise subtraction of two 16-element QS8 vectors (wrapping). Forwards to vsubq_s8. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Element-wise subtraction of two 8-element QS16 vectors (wrapping). Forwards to vsubq_s16. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtraction of two 8-element QS8 vectors. Forwards to vqsub_s8. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtraction of two 4-element QS16 vectors. Forwards to vqsub_s16. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtraction of two 16-element QS8 vectors. Forwards to vqsubq_s8. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtraction of two 8-element QS16 vectors. Forwards to vqsubq_s16. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
457
/** Fixed-point multiplication of two 8-element QS8 vectors.
 *
 * Widens to 16 bit, adds a rounding bias of 2^(fixed_point_position - 1),
 * shifts right by @p fixed_point_position (vshlq with a negative shift count)
 * and narrows back without saturation (vmovn).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiplication of two 4-element QS16 vectors.
 *
 * Widens to 32 bit, adds a rounding bias, shifts right by
 * @p fixed_point_position and narrows back without saturation.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
491
/** Fixed-point multiplication of two 16-element QS8 vectors.
 *
 * Processes the low and high halves separately (widening to 16 bit), applies
 * a rounding bias and a right shift by @p fixed_point_position, then narrows
 * and recombines without saturation.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiplication of two 8-element QS16 vectors.
 *
 * Same scheme as vmulq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
531
/** Saturating fixed-point multiplication of two 8-element QS8 vectors.
 *
 * Like vmul_qs8 but uses the saturating shift (vqshlq) and saturating
 * narrow (vqmovn), so overflowing products clamp instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiplication of two 4-element QS16 vectors.
 *
 * Like vmul_qs16 but with saturating shift and saturating narrow.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
565
/** Saturating fixed-point multiplication of two 16-element QS8 vectors.
 *
 * Low/high halves are widened to 16 bit, biased for rounding, shifted right
 * with saturation and narrowed back with saturation before recombining.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiplication of two 8-element QS16 vectors.
 *
 * Same scheme as vqmulq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
605
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
607{
608 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
609
610 qint16x8_t res = vmull_s8(a, b);
611
612 return vqrshlq_s16(res, fixed_point_position_s16);
613}
614
/** Fixed-point multiply-accumulate: a + b * c for 8-element QS8 vectors.
 *
 * The product b * c is computed with rounding (bias of 2^(fixed_point_position - 1))
 * and a right shift by @p fixed_point_position; the final accumulation wraps
 * (non-saturating vadd_s8).
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c for 4-element QS16 vectors.
 *
 * Same scheme as vmla_qs8 but widening to 32 bit.
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
648
/** Fixed-point multiply-accumulate: a + b * c for 16-element QS8 vectors.
 *
 * Low/high halves of b * c are widened, rounded and shifted separately, then
 * narrowed and added (wrapping) to the matching halves of @p a.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c for 8-element QS16 vectors.
 *
 * Same scheme as vmlaq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
688
/** Saturating fixed-point multiply-accumulate: a + b * c for 8-element QS8 vectors.
 *
 * Like vmla_qs8 but every step that can overflow saturates (vqshlq, vqmovn, vqadd).
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c for 4-element QS16 vectors.
 *
 * Like vmla_qs16 but with saturating shift, narrow and accumulate.
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
722
/** Saturating fixed-point multiply-accumulate: a + b * c for 16-element QS8 vectors.
 *
 * Low/high halves of b * c are widened, rounded, saturating-shifted and
 * saturating-narrowed, then added to @p a with saturation.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c for 8-element QS16 vectors.
 *
 * Same scheme as vqmlaq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
764
/** Widening fixed-point multiply-accumulate: a (QS16) + b * c (QS8 operands).
 *
 * The product stays in 16 bit: it is rounded, shifted right by
 * @p fixed_point_position and accumulated into @p a without narrowing (wrapping add).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a (QS32) + b * c (QS16 operands).
 *
 * Same scheme as vmlal_qs8 with a 32-bit accumulator.
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
798
/** Saturating widening fixed-point multiply-accumulate: a (QS16) + b * c (QS8 operands).
 *
 * Like vmlal_qs8 but the shift and the final accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate: a (QS32) + b * c (QS16 operands).
 *
 * Like vmlal_qs16 but the shift and the final accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
832
/** Convert 8 floats (two float32x4_t) to a QS8 vector with saturation.
 *
 * Scales by 2^fixed_point_position with round-to-nearest (a +/-0.5 bias chosen
 * per lane via vbslq_f32 on the sign), converts to s32 (truncation towards zero
 * after the bias) and narrows twice with saturation down to 8 bit.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bit, saturating at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
860
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100861inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100862{
863 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
864
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100865 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100866
867 res_f32 = vmlaq_f32(res_f32, a, pow2);
868
869 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
870
871 return vqmovn_s32(res_s32);
872}
873
/** Convert 16 floats (four float32x4_t) to a 16-element QS8 vector with saturation.
 *
 * Same scheme as vqcvt_qs8_f32 applied to four quads: round-to-nearest scale
 * by 2^fixed_point_position, convert to s32, then saturating narrow 32 -> 16 -> 8 bit.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 bit with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 bit with saturation and recombine
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
913
/** Convert a float32x4x2_t vector (8 floats) to a 16 bit fixed point vector with saturation.
 *
 * @param[in] a                    Float vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted, saturated fixed point vector
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-matched +/-0.5 bias per lane implements round-half-away-from-zero on the truncating conversion below
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale each float by 2^fixed_point_position and add the rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow 32 -> 16 bit
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
939
/** Convert an 8 bit fixed point vector to a float32x4x2_t vector (8 floats).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bit
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
967
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100968inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
969{
970 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
971 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
972
973 return vmulq_f32(res_f32, pow2);
974}
975
/** Convert an 8 bit fixed point vector (16 elements) to a float32x4x4_t vector (16 floats).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1015
/** Convert a 16 bit fixed point vector (8 elements) to a float32x4x2_t vector (8 floats).
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1041
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001042inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
1043{
1044 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001045 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
1046 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
1047 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001048
1049 // Find shift value
1050 const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1051 const qint8x8_t temp = vshl_s8(a, shift_value);
1052
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001053 qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001054
1055 uint8x8_t set_one = vcgt_s8(x, const_one);
1056 x = vbsl_s8(set_one, const_one, x);
1057
1058 // Use three iterations of Newton-Raphson method to get the result
1059 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1060 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1061 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1062
1063 return vshl_s8(x, shift_value);
1064}
1065
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001066inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1067{
1068 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1069 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1070 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1071 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1072
1073 // Find shift value
1074 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1075 const qint16x4_t temp = vshl_s16(a, shift_value);
1076
1077 qint16x4_t x = vadd_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
1078
1079 uint16x4_t set_one = vcgt_s16(x, const_one);
1080 x = vbsl_s16(set_one, const_one, x);
1081
1082 // Use five iterations of Newton-Raphson method to get the result
1083 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1084 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1085 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1086 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1087 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1088
1089 return vshl_s16(x, shift_value);
1090}
1091
Georgios Pinitas9247c922017-06-28 18:29:47 +01001092inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
1093{
1094 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
1095 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
1096 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
1097 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1098
1099 // Find shift value
1100 const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1101 const qint8x8_t temp = vqshl_s8(a, shift_value);
1102
1103 qint8x8_t x = vqadd_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));
1104
1105 uint8x8_t set_one = vcgt_s8(x, const_one);
1106 x = vbsl_s8(set_one, const_one, x);
1107
1108 // Use three iterations of Newton-Raphson method to get the result
1109 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1110 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1111 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1112
1113 return vqshl_s8(x, shift_value);
1114}
1115
1116inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1117{
1118 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1119 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1120 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1121 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1122
1123 // Find shift value
1124 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1125 const qint16x4_t temp = vqshl_s16(a, shift_value);
1126
1127 qint16x4_t x = vqadd_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
1128
1129 uint16x4_t set_one = vcgt_s16(x, const_one);
1130 x = vbsl_s16(set_one, const_one, x);
1131
1132 // Use five iterations of Newton-Raphson method to get the result
1133 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1134 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1135 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1136 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1137 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1138
1139 return vqshl_s16(x, shift_value);
1140}
1141
/** Calculate the reciprocal (1/a) of each lane of an 8 bit fixed point vector (16 elements)
 * using the Newton-Raphson method (3 iterations, non-saturating arithmetic).
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; earlier comment said -1.8823)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1166
/** Calculate the reciprocal (1/a) of each lane of a 16 bit fixed point vector (8 elements)
 * using the Newton-Raphson method (5 iterations, non-saturating arithmetic).
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): the d-register variant encodes 48/17 as 0x5A5A, which is the closer Q2.13
    // value of 2.8235; 0x5A56 here looks like a typo — confirm intended constant.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1193
/** Calculate the saturating reciprocal (1/a) of each lane of an 8 bit fixed point vector (16 elements)
 * using the Newton-Raphson method (3 iterations, saturating arithmetic).
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; earlier comment said -1.8823)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1218
/** Calculate the saturating reciprocal (1/a) of each lane of a 16 bit fixed point vector (8 elements)
 * using the Newton-Raphson method (5 iterations, saturating arithmetic).
 * Lanes where a == 0 return the maximum representable value instead of overflowing.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): the d-register variant encodes 48/17 as 0x5A5A, which is the closer Q2.13
    // value of 2.8235; 0x5A56 here looks like a typo — confirm intended constant.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero yields the largest positive value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1246
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001247inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1248{
1249 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1250}
1251
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001252inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1253{
1254 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1255}
1256
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001257inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1258{
1259 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1260}
1261
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001262inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1263{
1264 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1265}
1266
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001267template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001268inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001269{
1270 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1271 const qint8x8_t const_one = vdup_n_s8(1);
1272 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
1273 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1274 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1275 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1276 const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
1277 const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
1278 const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
1279 const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
1280 return res;
1281}
1282
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001283template <bool islog>
1284inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1285{
1286 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1287 const qint16x4_t const_one = vdup_n_s16(1);
1288 const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
1289 const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1290 const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1291 const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1292 const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
1293 const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
1294 const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
1295 const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
1296 return res;
1297}
1298
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on an 8 bit fixed point
 * vector using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1314
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on a 16 bit fixed point
 * vector using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1330
1331template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001332inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1333{
1334 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1335 const qint8x16_t const_one = vdupq_n_s8(1);
1336 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
1337 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1338 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1339 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1340 const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
1341 const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
1342 const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
1343 const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
1344 return res;
1345}
1346
1347template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001348inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1349{
1350 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1351 const qint16x8_t const_one = vdupq_n_s16(1);
1352 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1353 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1354 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1355 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1356 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1357 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1358 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1359 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1360 return res;
1361}
1362
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on an 8 bit fixed point
 * vector (16 elements) using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1378
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on a 16 bit fixed point
 * vector (8 elements) using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1394
/** Calculate the saturating exponential of each lane of an 8 bit fixed point vector.
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    // 0x58/2^7 = 0.6875 ~= ln(2); rescaled from Q0.7 to the target Q format
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    // 0x38/2^7 = 0.4375; OR-ing in const_one adds the integer 1, giving ~1.4375 ~= 1/ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1420
/** Calculate the saturating exponential of each lane of a 16 bit fixed point vector.
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9/2^15 ~= 0.6931 = ln(2); rescaled from Q0.15 to the target Q format
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA/2^15 ~= 0.4427; OR-ing in const_one adds the integer 1, giving ~1.4427 = 1/ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1446
/** Calculate the saturating exponential of each lane of an 8 bit fixed point vector (16 elements).
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // 0x58/2^7 = 0.6875 ~= ln(2); rescaled from Q0.7 to the target Q format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x38/2^7 = 0.4375; OR-ing in const_one adds the integer 1, giving ~1.4375 ~= 1/ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1472
/** Calculate the saturating exponential of each lane of a 16 bit fixed point vector (8 elements).
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9/2^15 ~= 0.6931 = ln(2); rescaled from Q0.15 to the target Q format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
    // 0x38AA/2^15 ~= 0.4427; OR-ing in const_one adds the integer 1, giving ~1.4427 = 1/ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1498
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001499inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1500{
1501 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1502 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1503 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1504
1505 // If 0 < a < 1, calculate log(1/x)
1506 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1507 qint8x8_t recip = vdup_n_s8(0);
1508 recip = vbsl_s8(calc_reciprocal, recip, a);
1509
1510 // Calculate reciprocal
1511 recip = vrecip_qs8(recip, fixed_point_position);
1512 a = vbsl_s8(calc_reciprocal, recip, a);
1513
1514 // Get decimal part of a
1515 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1516 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1517
1518 // Get exponent of 2^n which is equal or less than dec_a
1519 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1520
1521 // Get x to range (1, 2]
1522 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1523 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1524 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1525
1526 // Polynomial Approximation
1527 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1528
1529 // Reconstruct
1530 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1531
1532 // Set negative value for 0 < a < 1
1533 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1534
1535 return poly;
1536}
1537
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001538inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1539{
1540 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1541 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1542 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1543
1544 // If 0 < a < 1, calculate log(1/x)
1545 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1546 qint16x4_t recip = vdup_n_s16(0);
1547 recip = vbsl_s16(calc_reciprocal, recip, a);
1548
1549 // Calculate reciprocal
1550 recip = vrecip_qs16(recip, fixed_point_position);
1551 a = vbsl_s16(calc_reciprocal, recip, a);
1552
1553 // Get decimal part of a
1554 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1555 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1556
1557 // Get exponent of 2^n which is equal or less than dec_a
1558 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1559
1560 // Get x to range (1, 2]
1561 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1562 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1563 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1564
1565 // Polynomial Approximation
1566 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1567
1568 // Reconstruct
1569 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1570
1571 // Set negative value for 0 < a < 1
1572 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1573
1574 return poly;
1575}
1576
/** Calculate the logarithm of a fixed point 8-bit vector (16 elements).
 *
 * For inputs in (0, 1) the identity log(a) = -log(1/a) is used, so the range
 * reduction below always starts from a value >= 1. The value is then scaled
 * into (1, 2], a Taylor polynomial approximates log on that interval, and the
 * power-of-two scaling is folded back in (times ln(2)).
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated logarithm in the same QS8 format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    // ln(2) stored in Q0.7 and shifted down to the requested format
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep 'a' only in the lanes that need a reciprocal; other lanes stay 0 and
    // their (meaningless) reciprocal is discarded by the select after vrecipq_qs8.
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), where n is the extracted binary exponent
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1615
/** Calculate the logarithm of a fixed point 16-bit vector (8 elements).
 *
 * For inputs in (0, 1) the identity log(a) = -log(1/a) is used, so the range
 * reduction below always starts from a value >= 1. The value is then scaled
 * into (1, 2], a Taylor polynomial approximates log on that interval, and the
 * power-of-two scaling is folded back in (times ln(2)). Saturating add/sub/mul
 * variants are used where intermediate overflow is possible.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated logarithm in the same QS16 format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    // ln(2) stored in Q0.15 and shifted down to the requested format
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep 'a' only in the lanes that need a reciprocal; other lanes stay 0 and
    // their (meaningless) reciprocal is discarded by the select after vqrecipq_qs16.
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), where n is the extracted binary exponent
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1654
/** Calculate the inverse square root of a fixed point 8-bit vector (8 elements).
 *
 * The input is scaled by an even power of two into the (0.5, 2) range, the
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) is run three times,
 * and the result is rescaled by half the original shift (since
 * 1/sqrt(a * 2^(2k)) = 2^(-k) / sqrt(a)).
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    // 8 - (clz(a) + fpp) is the exponent of a's leading bit relative to 1.0.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshl_s8(x, shift_value2);
}
1681
/** Calculate the inverse square root of a fixed point 16-bit vector (4 elements).
 *
 * The input is scaled by an even power of two into the (0.5, 2) range, the
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) is run five times,
 * and the result is rescaled by half the original shift (since
 * 1/sqrt(a * 2^(2k)) = 2^(-k) / sqrt(a)).
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    // 16 - (clz(a) + fpp) is the exponent of a's leading bit relative to 1.0.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshl_s16(x, shift_value2);
}
1710
/** Calculate the saturating inverse square root of a fixed point 8-bit vector (8 elements).
 *
 * Same algorithm as vinvsqrt_qs8 (normalise into (0.5, 2), three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2), rescale by half the shift), but using
 * saturating add/sub/neg/shift/multiply intrinsics so intermediates clamp
 * instead of wrapping.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshl_s8(x, shift_value2);
}
/** Calculate the saturating inverse square root of a fixed point 16-bit vector (4 elements).
 *
 * Same algorithm as vinvsqrt_qs16 (normalise into (0.5, 2), five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2), rescale by half the shift), but using
 * saturating add/sub/neg/shift/multiply intrinsics so intermediates clamp
 * instead of wrapping.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshl_s16(x, shift_value2);
}
1766
/** Calculate the inverse square root of a fixed point 8-bit vector (16 elements).
 *
 * Quad-register variant of vinvsqrt_qs8: normalise into (0.5, 2), run three
 * Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2), then rescale the
 * result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshlq_s8(x, shift_value2);
}
1793
/** Calculate the inverse square root of a fixed point 16-bit vector (8 elements).
 *
 * Quad-register variant of vinvsqrt_qs16: normalise into (0.5, 2), run five
 * Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2), then rescale the
 * result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshlq_s16(x, shift_value2);
}
1822
/** Calculate the saturating inverse square root of a fixed point 8-bit vector (16 elements).
 *
 * Quad-register, saturating variant of vinvsqrt_qs8: normalise into (0.5, 2),
 * run three Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2) with
 * saturating arithmetic, then rescale the result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshlq_s8(x, shift_value2);
}
1849
/** Calculate the saturating inverse square root of a fixed point 16-bit vector (8 elements).
 *
 * Quad-register, saturating variant of vinvsqrt_qs16: normalise into (0.5, 2),
 * run five Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2) with
 * saturating arithmetic, then rescale the result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshlq_s16(x, shift_value2);
}
1878
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001879inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001880{
1881 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1882 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1883
1884 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1885 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1886 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001887 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001888
1889 return tanh;
1890}
1891
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001892inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001893{
1894 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1895 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1896
1897 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1898 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1899 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001900 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001901
1902 return tanh;
1903}
1904
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001905inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001906{
1907 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1908 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1909
1910 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1911 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1912 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1913 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1914
1915 return tanh;
1916}
1917
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001918inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1919{
1920 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1921 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1922
1923 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1924 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1925 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1926 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1927
1928 return tanh;
1929}
1930
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001931inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1932{
1933 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1934}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001935
1936inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1937{
1938 float32x4x2_t res =
1939 {
1940 {
1941 vmaxq_f32(a.val[0], b.val[0]),
1942 vmaxq_f32(a.val[1], b.val[1])
1943 }
1944 };
1945 return res;
1946}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001947}