/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
28/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
29 * Format is in Q0.7 for all elements */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010030static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031{
32 {
33 vdup_n_s8(0x7F), // 0.9978546
34 vdup_n_s8(0x3F), // 0.4994721
35 vdup_n_s8(0x16), // 0.1763723
36 vdup_n_s8(0x05), // 0.0435108
37 }
38};
39
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010040/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
41 * Format is in Q0.15 for all elements */
42static const std::array<qint16x4_t, 4> exp_tab_qs16 =
43{
44 {
45 vdup_n_s16(0x7FBA), // 0.9978546
46 vdup_n_s16(0x3FE9), // 0.4994721
47 vdup_n_s16(0x1693), // 0.1763723
48 vdup_n_s16(0x0592), // 0.0435108
49 }
50};
51
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
53 * Format is in Q0.7 for all elements */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010054static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055{
56 {
57 vdupq_n_s8(0x7F), // 0.9978546
58 vdupq_n_s8(0x3F), // 0.4994721
59 vdupq_n_s8(0x16), // 0.1763723
60 vdupq_n_s8(0x05), // 0.0435108
61 }
62};
63
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010064/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
65 * Format is in Q0.15 for all elements */
66static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
67{
68 {
69 vdupq_n_s16(0x7FBA), // 0.9978546
70 vdupq_n_s16(0x3FE9), // 0.4994721
71 vdupq_n_s16(0x1693), // 0.1763723
72 vdupq_n_s16(0x0592), // 0.0435108
73 }
74};
75
Anthony Barbier6ff3b192017-09-04 18:44:23 +010076/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
77 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010078static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010079{
80 {
81 vdup_n_s8(0x5C), // 1.4384189
82 vdup_n_s8(-0x56), // -0.6771900
83 vdup_n_s8(0x29), // 0.3218538
84 vdup_n_s8(-0x0A), // -0.0832229
85 }
86};
87
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010088/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
89 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
90static const std::array<qint16x4_t, 4> log_tab_qs16 =
91{
92 {
93 vdup_n_s16(0x5C0F), // 1.4384189
94 vdup_n_s16(-0x56AE), // -0.6771900
95 vdup_n_s16(0x2933), // 0.3218538
96 vdup_n_s16(-0x0AA7), // -0.0832229
97 }
98};
99
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100100/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
101 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100102static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103{
104 {
105 vdupq_n_s8(0x5C), // 1.4384189
106 vdupq_n_s8(-0x56), // -0.6771900
107 vdupq_n_s8(0x29), // 0.3218538
108 vdupq_n_s8(-0x0A), // -0.0832229
109 }
110};
111
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100112/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
113 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
114static const std::array<qint16x8_t, 4> log_tabq_qs16 =
115{
116 {
117 vdupq_n_s16(0x5C0F), // 1.4384189
118 vdupq_n_s16(-0x56AE), // -0.6771900
119 vdupq_n_s16(0x2933), // 0.3218538
120 vdupq_n_s16(-0x0AA7), // -0.0832229
121 }
122};
123
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124inline qint8x8_t vget_low_qs8(qint8x16_t a)
125{
126 return vget_low_s8(a);
127}
128
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100129inline qint16x4_t vget_low_qs16(qint16x8_t a)
130{
131 return vget_low_s16(a);
132}
133
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134inline qint8x8_t vget_high_qs8(qint8x16_t a)
135{
136 return vget_high_s8(a);
137}
138
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100139inline qint16x4_t vget_high_qs16(qint16x8_t a)
140{
141 return vget_high_s16(a);
142}
143
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144inline qint8x8_t vld1_qs8(const qint8_t *addr)
145{
146 return vld1_s8(addr);
147}
148
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100149inline qint16x4_t vld1_qs16(const qint16_t *addr)
150{
151 return vld1_s16(addr);
152}
153
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100154inline qint8x16_t vld1q_qs8(const qint8_t *addr)
155{
156 return vld1q_s8(addr);
157}
158
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159inline qint16x8_t vld1q_qs16(const qint16_t *addr)
160{
161 return vld1q_s16(addr);
162}
163
164inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
165{
166 return vld1_dup_s8(addr);
167}
168
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100169inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
170{
171 return vld1_dup_s16(addr);
172}
173
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100174inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
175{
176 return vld1q_dup_s8(addr);
177}
178
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100179inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
180{
181 return vld1q_dup_s16(addr);
182}
183
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100184inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
185{
186 return vld2q_s16(addr);
187}
188
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100189inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
190{
191 vst1_s8(addr, b);
192}
193
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
195{
196 vst1_s16(addr, b);
197}
198
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100199inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
200{
201 vst1q_s8(addr, b);
202}
203
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100204inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
205{
206 vst1q_s16(addr, b);
207}
208
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100209inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
210{
211 vst2q_s16(addr, b);
212}
213
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214inline qint8x8_t vqmovn_qs16(qint16x8_t a)
215{
216 return vqmovn_s16(a);
217}
218
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100219inline qint16x4_t vqmovn_qs32(qint32x4_t a)
220{
221 return vqmovn_s32(a);
222}
223
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224inline qint8x8_t vdup_n_qs8(qint8_t a)
225{
226 return vdup_n_s8(a);
227}
228
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100229inline qint16x4_t vdup_n_qs16(qint16_t a)
230{
231 return vdup_n_s16(a);
232}
233
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234inline qint8x16_t vdupq_n_qs8(qint8_t a)
235{
236 return vdupq_n_s8(a);
237}
238
239inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
240{
241 float32x4x4_t res =
242 {
243 {
244 vdupq_n_f32(a),
245 vdupq_n_f32(a),
246 vdupq_n_f32(a),
247 vdupq_n_f32(a),
248 }
249 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100250 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251}
252
253inline qint16x8_t vdupq_n_qs16(qint16_t a)
254{
255 return vdupq_n_s16(a);
256}
257
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100258inline qint32x4_t vdupq_n_qs32(qint32_t a)
259{
260 return vdupq_n_s32(a);
261}
262
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100263inline qint8x8_t vabs_qs8(qint8x8_t a)
264{
265 return vabs_s8(a);
266}
267
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100268inline qint16x4_t vabs_qs16(qint16x4_t a)
269{
270 return vabs_s16(a);
271}
272
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100273inline qint8x16_t vabsq_qs8(qint8x16_t a)
274{
275 return vabsq_s8(a);
276}
277
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100278inline qint16x8_t vabsq_qs16(qint16x8_t a)
279{
280 return vabsq_s16(a);
281}
282
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283inline qint8x8_t vqabs_qs8(qint8x8_t a)
284{
285 return vqabs_s8(a);
286}
287
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100288inline qint16x4_t vqabs_qs16(qint16x4_t a)
289{
290 return vqabs_s16(a);
291}
292
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100293inline qint8x16_t vqabsq_qs8(qint8x16_t a)
294{
295 return vqabsq_s8(a);
296}
297
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100298inline qint16x8_t vqabsq_qs16(qint16x8_t a)
299{
300 return vqabsq_s16(a);
301}
302
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
304{
305 return vmax_s8(a, b);
306}
307
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100308inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
309{
310 return vmax_s16(a, b);
311}
312
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100313inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
314{
315 return vmaxq_s8(a, b);
316}
317
318inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
319{
320 return vpmax_s8(a, b);
321}
322
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100323inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
324{
325 return vpmax_s16(a, b);
326}
327
328inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
329{
330 return vmaxq_s16(a, b);
331}
332
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100333inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
334{
335 return vmin_s8(a, b);
336}
337
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100338inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
339{
340 return vmin_s16(a, b);
341}
342
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100343inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
344{
345 return vminq_s8(a, b);
346}
347
348inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
349{
350 return vpmin_s8(a, b);
351}
352
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100353inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
354{
355 return vpmin_s16(a, b);
356}
357
358inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
359{
360 return vminq_s16(a, b);
361}
362
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
364{
365 return vadd_s8(a, b);
366}
367
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100368inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
369{
370 return vadd_s16(a, b);
371}
372
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100373inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
374{
375 return vaddq_s8(a, b);
376}
377
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100378inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
379{
380 return vaddq_s16(a, b);
381}
382
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100383inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
384{
385 return vqadd_s8(a, b);
386}
387
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100388inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
389{
390 return vqadd_s16(a, b);
391}
392
Georgios Pinitas9247c922017-06-28 18:29:47 +0100393inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
394{
395 return vqadd_s32(a, b);
396}
397
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100398inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
399{
400 return vqaddq_s8(a, b);
401}
402
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
404{
405 return vqaddq_s16(a, b);
406}
407
Georgios Pinitas9247c922017-06-28 18:29:47 +0100408inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
409{
410 return vqaddq_s32(a, b);
411}
412
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100413inline int16x4_t vpaddl_qs8(qint8x8_t a)
414{
415 return vpaddl_s8(a);
416}
417
418inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
419{
420 return vsub_s8(a, b);
421}
422
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100423inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
424{
425 return vsub_s16(a, b);
426}
427
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100428inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
429{
430 return vsubq_s8(a, b);
431}
432
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100433inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
434{
435 return vsubq_s16(a, b);
436}
437
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100438inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
439{
440 return vqsub_s8(a, b);
441}
442
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100443inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
444{
445 return vqsub_s16(a, b);
446}
447
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100448inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
449{
450 return vqsubq_s8(a, b);
451}
452
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100453inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
454{
455 return vqsubq_s16(a, b);
456}
457
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
459{
460 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
461
462 // Initialize the temporary result with a constant used to round up the result
463 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
464
465 // Vector multiply-accumulate long
466 res = vmlal_s8(res, a, b);
467
468 // Shift right by fixed_point_position
469 res = vshlq_s16(res, fixed_point_position_s16);
470
471 // Convert back to qint8
472 return vmovn_s16(res);
473}
474
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100475inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
476{
477 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
478
479 // Initialize the temporary result with a constant used to round up the result
480 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
481
482 // Vector multiply-accumulate long
483 res = vmlal_s16(res, a, b);
484
485 // Shift right by fixed_point_position
486 res = vshlq_s32(res, fixed_point_position_s32);
487
488 // Convert back to qint16
489 return vmovn_s32(res);
490}
491
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100492inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
493{
494 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
495
496 // Initialize the temporary results with a constant used to round up the result
497 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
498 qint16x8_t res1 = res0;
499
500 // Vector multiply-accumulate long
501 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
502 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
503
504 // Shift right by fixed_point_position
505 res0 = vshlq_s16(res0, fixed_point_position_s16);
506 res1 = vshlq_s16(res1, fixed_point_position_s16);
507
508 // Convert back to qint8
509 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
510}
511
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100512inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
513{
514 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
515
516 // Initialize the temporary results with a constant used to round up the result
517 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
518 qint32x4_t res1 = res0;
519
520 // Vector multiply-accumulate long
521 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
522 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
523
524 // Shift right by fixed_point_position
525 res0 = vshlq_s32(res0, fixed_point_position_s32);
526 res1 = vshlq_s32(res1, fixed_point_position_s32);
527
528 // Convert back to qint16
529 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
530}
531
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100532inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
533{
534 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
535
536 // Initialize the temporary result with a constant used to round up the result
537 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
538
539 // Vector multiply-accumulate long
540 res = vmlal_s8(res, a, b);
541
542 // Shift right by fixed_point_position
543 res = vqshlq_s16(res, fixed_point_position_s16);
544
545 // Convert back to qint8 and saturate
546 return vqmovn_s16(res);
547}
548
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100549inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
550{
551 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
552
553 // Initialize the temporary result with a constant used to round up the result
554 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
555
556 // Vector multiply-accumulate long
557 res = vmlal_s16(res, a, b);
558
559 // Shift right by fixed_point_position
560 res = vqshlq_s32(res, fixed_point_position_s32);
561
562 // Convert back to qint16 and saturate
563 return vqmovn_s32(res);
564}
565
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100566inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
567{
568 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
569
570 // Initialize the temporary results with a constant used to round up the result
571 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
572 qint16x8_t res1 = res0;
573
574 // Vector multiply-accumulate long
575 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
576 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
577
578 // Shift right by fixed_point_position
579 res0 = vqshlq_s16(res0, fixed_point_position_s16);
580 res1 = vqshlq_s16(res1, fixed_point_position_s16);
581
582 // Convert back to qint8 and saturate
583 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
584}
585
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100586inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
587{
588 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
589
590 // Initialize the temporary results with a constant used to round up the result
591 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
592 qint32x4_t res1 = res0;
593
594 // Vector multiply-accumulate long
595 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
596 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
597
598 // Shift right by fixed_point_position
599 res0 = vqshlq_s32(res0, fixed_point_position_s32);
600 res1 = vqshlq_s32(res1, fixed_point_position_s32);
601
602 // Convert back to qint16 and saturate
603 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
604}
605
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
607{
608 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
609
610 qint16x8_t res = vmull_s8(a, b);
611
612 return vqrshlq_s16(res, fixed_point_position_s16);
613}
614
615inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
616{
617 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
618
619 // Initialize the temporary results with a constant used to round up the result
620 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
621
622 // Vector multiply-accumulate long
623 tmp = vmlal_s8(tmp, b, c);
624
625 // Shift right by fixed_point_position
626 tmp = vshlq_s16(tmp, fixed_point_position_s16);
627
628 // Convert back to qint8 and accumulate
629 return vadd_s8(a, vmovn_s16(tmp));
630}
631
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100632inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
633{
634 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
635
636 // Initialize the temporary results with a constant used to round up the result
637 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
638
639 // Vector multiply-accumulate long
640 tmp = vmlal_s16(tmp, b, c);
641
642 // Shift right by fixed_point_position
643 tmp = vshlq_s32(tmp, fixed_point_position_s32);
644
645 // Convert back to qint16 and accumulate
646 return vadd_s16(a, vmovn_s32(tmp));
647}
648
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100649inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
650{
651 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
652
653 // Initialize the temporary results with a constant used to round up the result
654 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
655 qint16x8_t tmp1 = tmp0;
656
657 // Vector multiply-accumulate long
658 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
659 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
660
661 // Shift right by fixed_point_position
662 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
663 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
664
665 // Convert back to qint8 and accumulate
666 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
667}
668
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100669inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
670{
671 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
672
673 // Initialize the temporary results with a constant used to round up the result
674 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
675 qint32x4_t tmp1 = tmp0;
676
677 // Vector multiply-accumulate long
678 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
679 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
680
681 // Shift right by fixed_point_position
682 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
683 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
684
685 // Convert back to qint16 and accumulate
686 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
687}
688
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100689inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
690{
691 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
692
693 // Initialize the temporary results with a constant used to round up the result
694 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
695
696 // Vector multiply-accumulate long
697 tmp = vmlal_s8(tmp, b, c);
698
699 // Shift right by fixed_point_position
700 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
701
702 // Convert back to qint8 and accumulate
703 return vqadd_s8(a, vqmovn_s16(tmp));
704}
705
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100706inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
707{
708 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
709
710 // Initialize the temporary results with a constant used to round up the result
711 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
712
713 // Vector multiply-accumulate long
714 tmp = vmlal_s16(tmp, b, c);
715
716 // Shift right by fixed_point_position
717 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
718
719 // Convert back to qint8 and accumulate
720 return vqadd_s16(a, vqmovn_s32(tmp));
721}
722
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
724{
725 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
726
727 // Initialize the temporary results with a constant used to round up the result
728 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
729 qint16x8_t tmp1 = tmp0;
730
731 // Vector multiply-accumulate long
732 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
733 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
734
735 // Shift right by fixed_point_position
736 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
737 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
738
739 // Convert back to qint8 and accumulate
740 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
741 return vqaddq_s8(a, res);
742}
743
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100744inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
745{
746 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
747
748 // Initialize the temporary results with a constant used to round up the result
749 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
750 qint32x4_t tmp1 = tmp0;
751
752 // Vector multiply-accumulate long
753 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
754 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
755
756 // Shift right by fixed_point_position
757 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
758 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
759
760 // Convert back to qint16 and accumulate
761 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
762 return vqaddq_s16(a, res);
763}
764
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100765inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
766{
767 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
768
769 // Initialize the temporary results with a constant used to round up the result
770 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
771
772 // Vector multiply-accumulate long
773 tmp = vmlal_s8(tmp, b, c);
774
775 // Shift right by fixed_point_position
776 tmp = vshlq_s16(tmp, fixed_point_position_s16);
777
778 // Accumulate
779 return vaddq_s16(a, tmp);
780}
781
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100782inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
783{
784 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
785
786 // Initialize the temporary results with a constant used to round up the result
787 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
788
789 // Vector multiply-accumulate long
790 tmp = vmlal_s16(tmp, b, c);
791
792 // Shift right by fixed_point_position
793 tmp = vshlq_s32(tmp, fixed_point_position_s32);
794
795 // Accumulate
796 return vaddq_s32(a, tmp);
797}
798
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100799inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
800{
801 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
802
803 // Initialize the temporary results with a constant used to round up the result
804 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
805
806 // Vector multiply-accumulate long
807 tmp = vmlal_s8(tmp, b, c);
808
809 // Shift right by fixed_point_position
810 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
811
812 // Accumulate
813 return vqaddq_s16(a, tmp);
814}
815
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100816inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
817{
818 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
819
820 // Initialize the temporary results with a constant used to round up the result
821 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
822
823 // Vector multiply-accumulate long
824 tmp = vmlal_s16(tmp, b, c);
825
826 // Shift right by fixed_point_position
827 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
828
829 // Accumulate
830 return vqaddq_s32(a, tmp);
831}
832
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100833inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100834{
835 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
836
837 float32x4x2_t res_f32 =
838 {
839 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100840 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
841 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100842 }
843 };
844
845 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
846 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
847
848 const int32x4x2_t res_s32 =
849 {
850 {
851 vcvtq_s32_f32(res_f32.val[0]),
852 vcvtq_s32_f32(res_f32.val[1]),
853 }
854 };
855
856 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
857
858 return vqmovn_s16(res_s16);
859}
860
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100861inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100862{
863 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
864
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100865 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100866
867 res_f32 = vmlaq_f32(res_f32, a, pow2);
868
869 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
870
871 return vqmovn_s32(res_s32);
872}
873
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100874inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100875{
876 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
877
878 float32x4x4_t res_f32 =
879 {
880 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100881 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
882 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
883 vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
884 vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100885 }
886 };
887
888 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
889 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
890 res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
891 res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
892
893 const int32x4x4_t res_s32 =
894 {
895 {
896 vcvtq_s32_f32(res_f32.val[0]),
897 vcvtq_s32_f32(res_f32.val[1]),
898 vcvtq_s32_f32(res_f32.val[2]),
899 vcvtq_s32_f32(res_f32.val[3]),
900 }
901 };
902
903 const int16x8x2_t res_s16 =
904 {
905 {
906 vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
907 vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
908 }
909 };
910
911 return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
912}
913
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100914inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100915{
916 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
917
918 float32x4x2_t res_f32 =
919 {
920 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100921 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
922 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100923 }
924 };
925
926 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
927 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
928
929 const int32x4x2_t res_s32 =
930 {
931 {
932 vcvtq_s32_f32(res_f32.val[0]),
933 vcvtq_s32_f32(res_f32.val[1])
934 }
935 };
936
937 return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
938}
939
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100940inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
941{
942 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
943
944 const int16x8_t res_s16 = vmovl_s8(a);
945
946 const int32x4x2_t res_s32 =
947 {
948 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100949 vmovl_s16(vget_low_qs16(res_s16)),
950 vmovl_s16(vget_high_qs16(res_s16))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100951 }
952 };
953
954 float32x4x2_t res_f32 =
955 {
956 {
957 vcvtq_f32_s32(res_s32.val[0]),
958 vcvtq_f32_s32(res_s32.val[1])
959 }
960 };
961
962 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
963 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
964
965 return res_f32;
966}
967
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100968inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
969{
970 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
971 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
972
973 return vmulq_f32(res_f32, pow2);
974}
975
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100976inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
977{
978 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
979
980 const int16x8x2_t res_s16 =
981 {
982 {
983 vmovl_s8(vget_low_s8(a)),
984 vmovl_s8(vget_high_s8(a)),
985 }
986 };
987
988 const int32x4x4_t res_s32 =
989 {
990 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100991 vmovl_s16(vget_low_qs16(res_s16.val[0])),
992 vmovl_s16(vget_high_qs16(res_s16.val[0])),
993 vmovl_s16(vget_low_qs16(res_s16.val[1])),
994 vmovl_s16(vget_high_qs16(res_s16.val[1])),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100995 }
996 };
997
998 float32x4x4_t res_f32 =
999 {
1000 {
1001 vcvtq_f32_s32(res_s32.val[0]),
1002 vcvtq_f32_s32(res_s32.val[1]),
1003 vcvtq_f32_s32(res_s32.val[2]),
1004 vcvtq_f32_s32(res_s32.val[3])
1005 }
1006 };
1007
1008 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1009 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1010 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
1011 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
1012
1013 return res_f32;
1014}
1015
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001016inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
1017{
1018 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1019
1020 const int32x4x2_t res_s32 =
1021 {
1022 {
1023 vmovl_s16(vget_low_qs16(a)),
1024 vmovl_s16(vget_high_qs16(a))
1025 }
1026 };
1027
1028 float32x4x2_t res_f32 =
1029 {
1030 {
1031 vcvtq_f32_s32(res_s32.val[0]),
1032 vcvtq_f32_s32(res_s32.val[1])
1033 }
1034 };
1035
1036 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1037 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1038
1039 return res_f32;
1040}
1041
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * The input is first normalized into [0.5, 1) by a per-lane shift, a linear
 * initial estimate x0 = 48/17 - 32/17 * a is refined with three iterations of
 * x(n+1) = x(n) + x(n) * (1 - a * x(n)), and the normalization shift is then
 * applied in the opposite direction. Non-saturating arithmetic throughout.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1): clz(a) + fixed_point_position - 8
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vshl_s8(x, shift_value);
}
1066
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001067inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1068{
1069 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1070 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1071 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1072 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1073
1074 // Find shift value
1075 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1076 const qint16x4_t temp = vshl_s16(a, shift_value);
1077
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001078 // Newton-Raphson division initial estimate X0 calculation
1079 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001080
1081 uint16x4_t set_one = vcgt_s16(x, const_one);
1082 x = vbsl_s16(set_one, const_one, x);
1083
1084 // Use five iterations of Newton-Raphson method to get the result
1085 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1086 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1087 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1088 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1089 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1090
1091 return vshl_s16(x, shift_value);
1092}
1093
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * Saturating counterpart of vrecip_qs8: the shifts, adds, subs and multiplies
 * all saturate instead of wrapping. Same algorithm: normalize into [0.5, 1),
 * refine x0 = 48/17 - 32/17 * a with three Newton-Raphson iterations, then
 * re-apply the normalization shift.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vqshl_s8(x, shift_value);
}
1118
1119inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1120{
1121 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1122 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1123 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1124 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1125
1126 // Find shift value
1127 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1128 const qint16x4_t temp = vqshl_s16(a, shift_value);
1129
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001130 // Newton-Raphson division initial estimate X0 calculation
1131 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001132
1133 uint16x4_t set_one = vcgt_s16(x, const_one);
1134 x = vbsl_s16(set_one, const_one, x);
1135
1136 // Use five iterations of Newton-Raphson method to get the result
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001137 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1138 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1139 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1140 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1141 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001142
1143 return vqshl_s16(x, shift_value);
1144}
1145
/** Calculate the reciprocal (1/a) of a 16 x 8 bit fixed point vector using Newton-Raphson.
 *
 * Q-register counterpart of vrecip_qs8: normalize into [0.5, 1), refine the
 * linear initial estimate x0 = 48/17 - 32/17 * a with three iterations of
 * x(n+1) = x(n) + x(n) * (1 - a * x(n)), then re-apply the normalization
 * shift. Non-saturating arithmetic throughout.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vshlq_s8(x, shift_value);
}
1171
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001172inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1173{
1174 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1175 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1176 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1177 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1178
1179 // Find shift value
1180 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1181 const qint16x8_t temp = vshlq_s16(a, shift_value);
1182
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001183 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001184 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1185
1186 // Set initial guess to one if x > 1
1187 uint16x8_t set_one = vcgtq_s16(x, const_one);
1188 x = vbslq_s16(set_one, const_one, x);
1189
1190 // Use five iterations of Newton-Raphson method to get the result
1191 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1192 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1193 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1194 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1195 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1196
1197 return vshlq_s16(x, shift_value);
1198}
1199
/** Calculate the reciprocal (1/a) of a 16 x 8 bit fixed point vector using Newton-Raphson.
 *
 * Saturating Q-register counterpart of vrecip_qs8: the shifts, adds, subs and
 * multiplies all saturate instead of wrapping. Same algorithm: normalize into
 * [0.5, 1), refine x0 = 48/17 - 32/17 * a with three Newton-Raphson
 * iterations, then re-apply the normalization shift.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vqshlq_s8(x, shift_value);
}
1225
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001226inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1227{
1228 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1229 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1230 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1231 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1232
1233 // Find shift value
1234 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1235 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1236
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001237 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001238 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1239
1240 // Set initial guess to one if x > 1
1241 uint16x8_t set_one = vcgtq_s16(x, const_one);
1242 x = vbslq_s16(set_one, const_one, x);
1243
1244 // Use five iterations of Newton-Raphson method to get the result
1245 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1246 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1247 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1248 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1249 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1250
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001251 // Saturate result in case of overflow
1252 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001253}
1254
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001255inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1256{
1257 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1258}
1259
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001260inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1261{
1262 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1263}
1264
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001265inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1266{
1267 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1268}
1269
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001270inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1271{
1272 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1273}
1274
/** Evaluate a degree-4 polynomial with no constant term via Horner's scheme:
 *  res = A*a + B*a^2 + C*a^3 + D*a^4 = a * (A + a * (B + a * (C + a * D)))
 *
 * Coefficients come from log_tab_qs8 when @p islog is true, otherwise from
 * exp_tab_qs8. They are stored in Q0.7 and rescaled to the working Q format
 * with a rounding shift; for the log table the leading coefficient A is
 * shifted one bit less, i.e. doubled relative to its table value.
 * Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x8_t x1  = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1290
/** Evaluate a degree-4 polynomial with no constant term via Horner's scheme:
 *  res = A*a + B*a^2 + C*a^3 + D*a^4 = a * (A + a * (B + a * (C + a * D)))
 *
 * Coefficients come from log_tab_qs16 when @p islog is true, otherwise from
 * exp_tab_qs16. They are stored in Q0.15 and rescaled to the working Q format
 * with a rounding shift; for the log table the leading coefficient A is
 * shifted one bit less, i.e. doubled relative to its table value.
 * Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x4_t x1  = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1306
/** Saturating variant of vtaylor_poly_qs8: same Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) with coefficients from
 *  log_tab_qs8 (islog) or exp_tab_qs8 rescaled from Q0.7 to the working
 *  format, but every shift, add and multiply saturates instead of wrapping.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1322
/** Saturating variant of vtaylor_poly_qs16: same Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) with coefficients from
 *  log_tab_qs16 (islog) or exp_tab_qs16 rescaled from Q0.15 to the working
 *  format, but every shift, add and multiply saturates instead of wrapping.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1338
/** Q-register variant of vtaylor_poly_qs8: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 16 lanes with coefficients
 *  from log_tabq_qs8 (islog) or exp_tabq_qs8 rescaled from Q0.7 to the
 *  working format. Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x16_t x1  = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1354
/** Q-register variant of vtaylor_poly_qs16: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 8 lanes with coefficients
 *  from log_tabq_qs16 (islog) or exp_tabq_qs16 rescaled from Q0.15 to the
 *  working format. Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x8_t x1  = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1370
/** Saturating Q-register variant of vtaylor_poly_qs8: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 16 lanes with coefficients
 *  from log_tabq_qs8 (islog) or exp_tabq_qs8 rescaled from Q0.7 to the
 *  working format; every shift, add and multiply saturates.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1386
/** Saturating Q-register variant of vtaylor_poly_qs16: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 8 lanes with coefficients
 *  from log_tabq_qs16 (islog) or exp_tabq_qs16 rescaled from Q0.15 to the
 *  working format; every shift, add and multiply saturates.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1402
/** Saturating exponential e^a of an 8 bit fixed point vector.
 *
 * Range-reduces by powers of two: with m = a / ln(2) it evaluates
 * e^a = 2^floor(m) * e^(a - floor(m)*ln(2)), approximating the residual
 * exponential with the degree-4 polynomial of vqtaylor_poly_qs8 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7, rescaled to the working format
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    // 0x38 is the fractional part of 1/ln(2) in Q0.7; OR-ing const_one adds the integer part 1
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1428
/** Saturating exponential e^a of a 16 bit fixed point vector.
 *
 * Range-reduces by powers of two: with m = a / ln(2) it evaluates
 * e^a = 2^floor(m) * e^(a - floor(m)*ln(2)), approximating the residual
 * exponential with the degree-4 polynomial of vqtaylor_poly_qs16 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15, rescaled to the working format
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing const_one adds the integer part 1
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1454
/** Saturating exponential e^a of a 16 x 8 bit fixed point vector.
 *
 * Q-register counterpart of vqexp_qs8: range-reduces by powers of two and
 * approximates the residual exponential with the degree-4 polynomial of
 * vqtaylor_polyq_qs8 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7, rescaled to the working format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x38 is the fractional part of 1/ln(2) in Q0.7; OR-ing const_one adds the integer part 1
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1480
/** Saturating exponential e^a of an 8 x 16 bit fixed point vector.
 *
 * Q-register counterpart of vqexp_qs16: range-reduces by powers of two and
 * approximates the residual exponential with the degree-4 polynomial of
 * vqtaylor_polyq_qs16 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15, rescaled to the working format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing const_one adds the integer part 1
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1506
/** Natural logarithm of 8 x 8-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * Fix: the vbsl_s8 operands were swapped (`vbsl_s8(calc_reciprocal, recip, a)`),
 * which fed zero into vrecip_qs8 for exactly the lanes that needed 1/a.
 * vbsl selects its SECOND argument where the mask bits are set, so the value
 * to invert must come first — matching the correct order used in vlogq_qs8.
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Keep the lanes that need the reciprocal (a < 1), zero elsewhere
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1545
/** Natural logarithm of 4 x 16-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * Fix: the vbsl_s16 operands were swapped (`vbsl_s16(calc_reciprocal, recip, a)`),
 * which fed zero into vrecip_qs16 for exactly the lanes that needed 1/a.
 * vbsl selects its SECOND argument where the mask bits are set, so the value
 * to invert must come first — matching the correct order used in vlogq_qs8/vlogq_qs16.
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Keep the lanes that need the reciprocal (a < 1), zero elsewhere
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1584
/** Natural logarithm of 16 x 8-bit fixed-point values.
 *
 * Lanes with 0 < a < 1 are replaced by their reciprocal first
 * (log(a) = -log(1/a)), so the core approximation only sees arguments >= 1.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one   = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t seven = vdupq_n_s8(7);
    const qint8x16_t ln2   = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // Keep only the lanes that need a reciprocal (a < 1), zero elsewhere
    uint8x16_t use_recip = vcltq_s8(a, one);
    qint8x16_t recip     = vdupq_n_s8(0);
    recip                = vbslq_s8(use_recip, a, recip);

    // Compute 1/a for those lanes and merge back
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(use_recip, recip, a);

    // Integer part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Largest n with 2^n <= dec_a
    shift_value = vsubq_s8(seven, vclzq_s8(dec_a));

    // Normalize a into (1, 2] and split off the n * 1.0 contribution
    const qint8x16_t frac = vsubq_s8(vrshlq_s8(a, vnegq_s8(shift_value)), one);
    const qint8x16_t bias = vmulq_s8(shift_value, one);

    // Polynomial approximation of log on (1, 2]
    qint8x16_t poly = vtaylor_polyq_qs8<true>(frac, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, bias), ln2, fixed_point_position);

    // Negate the lanes where the reciprocal was taken
    return vbslq_s8(use_recip, vnegq_s8(poly), poly);
}
1623
/** Natural logarithm of 8 x 16-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * NOTE(review): this variant mixes saturating ops (vqrecipq_qs16, vqsubq_s16,
 * vqmulq_qs16) with plain ones (vmulq_s16), unlike vlogq_qs8 which is fully
 * non-saturating — verify this asymmetry is intentional.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) in Q0.15, rescaled to the requested format

    // If 0 < a < 1, calculate log(1/x): keep the lanes needing a reciprocal (a < 1), zero elsewhere
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating) and merge it back into those lanes
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1662
/** Inverse square root (1/sqrt(a)) of 8 x 8-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint8x8_t bias        = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t is_negative = vclt_s8(bias, vdup_n_qs8(0));
    bias                  = vbsl_s8(is_negative, vadd_s8(bias, vdup_n_s8(1)), bias);
    const qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(bias, 1));

    // Scaled argument and initial guess
    const qint8x8_t scaled = vshl_s8(a, shift_value);
    qint8x8_t       x      = scaled;

    // Three Newton-Raphson rounds suffice for 8-bit precision
    for(int iter = 0; iter < 3; ++iter)
    {
        x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(scaled, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshl_s8(x, shift_value2);
}
1689
/** Inverse square root (1/sqrt(a)) of 4 x 16-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint16x4_t bias        = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t is_negative = vclt_s16(bias, vdup_n_qs16(0));
    bias                   = vbsl_s16(is_negative, vadd_s16(bias, vdup_n_s16(1)), bias);
    const qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(bias, 1));

    // Scaled argument and initial guess
    const qint16x4_t scaled = vshl_s16(a, shift_value);
    qint16x4_t       x      = scaled;

    // Five Newton-Raphson rounds suffice for 16-bit precision
    for(int iter = 0; iter < 5; ++iter)
    {
        x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(scaled, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshl_s16(x, shift_value2);
}
1718
/** Saturating inverse square root (1/sqrt(a)) of 8 x 8-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrt_qs8, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshl_s8(x, shift_value2);
}
1745
/** Saturating inverse square root (1/sqrt(a)) of 4 x 16-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrt_qs16, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshl_s16(x, shift_value2);
}
1774
/** Inverse square root (1/sqrt(a)) of 16 x 8-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint8x16_t bias        = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t is_negative = vcltq_s8(bias, vdupq_n_qs8(0));
    bias                   = vbslq_s8(is_negative, vaddq_s8(bias, vdupq_n_s8(1)), bias);
    const qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(bias, 1));

    // Scaled argument and initial guess
    const qint8x16_t scaled = vshlq_s8(a, shift_value);
    qint8x16_t       x      = scaled;

    // Three Newton-Raphson rounds suffice for 8-bit precision
    for(int iter = 0; iter < 3; ++iter)
    {
        x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(scaled, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshlq_s8(x, shift_value2);
}
1801
/** Inverse square root (1/sqrt(a)) of 8 x 16-bit fixed-point values via
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) (non-saturating variant).
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring the argument into the convergence range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vshlq_s16(x, shift_value2);
}
1830
/** Saturating inverse square root (1/sqrt(a)) of 16 x 8-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrtq_qs8, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshlq_s8(x, shift_value2);
}
1857
/** Saturating inverse square root (1/sqrt(a)) of 8 x 16-bit fixed-point
 * values, computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint16x8_t bias        = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t is_negative = vcltq_s16(bias, vdupq_n_qs16(0));
    bias                   = vbslq_s16(is_negative, vqaddq_s16(bias, vdupq_n_s16(1)), bias);
    const qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(bias, 1));

    // Scaled argument and initial guess
    const qint16x8_t scaled = vqshlq_s16(a, shift_value);
    qint16x8_t       x      = scaled;

    // Five Newton-Raphson rounds suffice for 16-bit precision
    for(int iter = 0; iter < 5; ++iter)
    {
        x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(scaled, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vqshlq_s16(x, shift_value2);
}
1886
/** Saturating hyperbolic tangent of 8 x 8-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t two = vdup_n_s8(2 << fixed_point_position);

    const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x8_t num   = vqsub_qs8(exp2x, one);
    const qint8x8_t den   = vqadd_qs8(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
}
1899
/** Saturating hyperbolic tangent of 4 x 16-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t two = vdup_n_s16(2 << fixed_point_position);

    const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x4_t num   = vqsub_qs16(exp2x, one);
    const qint16x4_t den   = vqadd_qs16(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
}
1912
/** Saturating hyperbolic tangent of 16 x 8-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position);

    const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t num   = vqsubq_qs8(exp2x, one);
    const qint8x16_t den   = vqaddq_qs8(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
}
1925
/** Saturating hyperbolic tangent of 8 x 16-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);

    // exp(2x), computed with the saturating fixed-point exponential
    const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t num = vqsubq_qs16(exp2x, const_one); // exp(2x) - 1
    const qint16x8_t den = vqaddq_qs16(exp2x, const_one); // exp(2x) + 1
    // Division implemented as multiplication by the reciprocal
    const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1938
/** Saturating power of 16 x 8-bit fixed-point values: a^b = exp(b * log(a)). */
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t log_a    = vlogq_qs8(a, fixed_point_position);
    const qint8x16_t exponent = vqmulq_qs8(b, log_a, fixed_point_position);
    return vqexpq_qs8(exponent, fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001943
/** Lane-wise maximum of two float32x4x2_t values (both halves compared independently). */
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
1954}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001955}