blob: 4e862ba387f381662f0b52333b2fbefcb2ac29c0 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};
51
/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};
99
/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the lower half (first 8 elements) of a 16-element 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (first 4 elements) of an 8-element 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (last 8 elements) of a 16-element 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (last 4 elements) of an 8-element 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load an 8 bit fixed point vector (8 elements) from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load a 16 bit fixed point vector (4 elements) from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load an 8 bit fixed point vector (16 elements) from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load a 16 bit fixed point vector (8 elements) from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}
163
/** Load a single 8 bit fixed point value from memory and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single 16 bit fixed point value from memory and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single 8 bit fixed point value from memory and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single 16 bit fixed point value from memory and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store an 8 bit fixed point vector (8 elements) to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store a 16 bit fixed point vector (4 elements) to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store an 8 bit fixed point vector (16 elements) to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store a 16 bit fixed point vector (8 elements) to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two 16 bit fixed point vectors (8 elements each) to memory with 2-way interleaving. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
208
/** Saturating narrow: convert a 16 bit fixed point vector (8 elements) to 8 bit with saturation. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: convert a 32 bit fixed point vector (4 elements) to 16 bit with saturation. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Duplicate an 8 bit fixed point value across all 8 lanes of a vector. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a 16 bit fixed point value across all 4 lanes of a vector. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate an 8 bit fixed point value across all 16 lanes of a vector. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Duplicate a float across 16 lanes and convert the result to 8 bit fixed point.
 *
 * @param[in] a                    Floating point value to duplicate
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 8 bit fixed point vector (16 elements) with every lane equal to @p a converted to Qx.y
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    // Delegate the float -> fixed point conversion (with rounding and saturation)
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Duplicate a 16 bit fixed point value across all 8 lanes of a vector. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a 32 bit fixed point value across all 4 lanes of a vector. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
257
/** Absolute value of an 8 bit fixed point vector (8 elements). Not saturating. */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of a 16 bit fixed point vector (4 elements). Not saturating. */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of an 8 bit fixed point vector (16 elements). Not saturating. */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of a 16 bit fixed point vector (8 elements). Not saturating. */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of an 8 bit fixed point vector (8 elements). */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of a 16 bit fixed point vector (4 elements). */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of an 8 bit fixed point vector (16 elements). */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of a 16 bit fixed point vector (8 elements). */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
297
/** Element-wise maximum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
357
/** Add two 8 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two 16 bit fixed point vectors (4 elements). Wraps on overflow. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two 8 bit fixed point vectors (16 elements). Wraps on overflow. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two 16 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two 32 bit fixed point vectors (2 elements). */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating add of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating add of two 32 bit fixed point vectors (4 elements). */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-long: sum adjacent pairs of an 8 bit fixed point vector, widening to 16 bit. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

/** Subtract two 8 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two 16 bit fixed point vectors (4 elements). Wraps on overflow. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two 8 bit fixed point vectors (16 elements). Wraps on overflow. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two 16 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
452
/** Multiply two 8 bit fixed point vectors (8 elements).
 *
 * The products are accumulated in 16 bit, rounded (round-half-up via the added
 * 1 << (fixed_point_position - 1) constant), shifted back and narrowed without saturation.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a * b in the same fixed point format
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount: vshlq with a negative count performs an arithmetic right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Multiply two 16 bit fixed point vectors (4 elements).
 *
 * Products are accumulated in 32 bit with rounding, then narrowed without saturation.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Multiply two 8 bit fixed point vectors (16 elements).
 *
 * The low and high halves are processed separately in 16 bit with rounding,
 * then narrowed (without saturation) and recombined.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Multiply two 16 bit fixed point vectors (8 elements).
 *
 * The low and high halves are processed separately in 32 bit with rounding,
 * then narrowed (without saturation) and recombined.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
526
/** Saturating multiply of two 8 bit fixed point vectors (8 elements).
 *
 * Products are accumulated in 16 bit with rounding; the shift and final narrow
 * are both saturating (vqshlq / vqmovn).
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a * b in the same fixed point format, saturated
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating multiply of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating multiply of two 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating multiply of two 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}

/** Multiply-long: multiply two 8 bit fixed point vectors (8 elements) producing a 16 bit result.
 *
 * Uses a rounding saturating shift (vqrshlq) to rescale, unlike the narrowing
 * multiplies above which add an explicit rounding constant.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
609
/** Multiply-accumulate: a + b * c for 8 bit fixed point vectors (8 elements).
 *
 * The product is computed in 16 bit with rounding, rescaled, narrowed
 * (without saturation) and added to @p a (wrapping add).
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiply operand
 * @param[in] c                    Second multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Multiply-accumulate: a + b * c for 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Multiply-accumulate: a + b * c for 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Multiply-accumulate: a + b * c for 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
683
/** Saturating multiply-accumulate: a + b * c for 8 bit fixed point vectors (8 elements).
 *
 * The product is computed in 16 bit with rounding; shift, narrow and the final
 * accumulation are all saturating.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiply operand
 * @param[in] c                    Second multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format, saturated
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating multiply-accumulate: a + b * c for 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating multiply-accumulate: a + b * c for 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating multiply-accumulate: a + b * c for 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
759
/** Widening multiply-accumulate: a + b * c, 8 bit operands accumulated into a 16 bit vector (8 elements).
 *
 * The product is rounded and rescaled before the (wrapping) accumulation; the
 * result stays in the widened 16 bit fixed point format.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiply operand
 * @param[in] c                    Second 8 bit multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 16 bit fixed point vector
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening multiply-accumulate: a + b * c, 16 bit operands accumulated into a 32 bit vector (4 elements). */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Saturating widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator (8 elements).
 *
 * Both the rescaling shift and the accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator (4 elements). */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
827
/** Convert two float32 vectors (8 floats total) to an 8 bit fixed point vector (8 elements), saturating.
 *
 * Rounds to nearest by adding +/-0.5 (sign-dependent) before scaling by
 * 2^fixed_point_position and truncating with vcvtq_s32_f32.
 *
 * @param[in] a                    Pair of float vectors to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return The converted, saturated 8 bit fixed point vector
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bit with saturation at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert a float32 vector to a 16 bit fixed point vector (4 elements), saturating.
 *
 * Same sign-aware rounding scheme as vqcvt_qs8_f32.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Narrow 32 -> 16 bit with saturation
    return vqmovn_s32(res_s32);
}
868
// Convert sixteen floats (four float32x4_t lanes) to a saturated Q8 fixed
// point vector: scale by 2^fixed_point_position, round half away from zero,
// then narrow 32 -> 16 -> 8 bits with saturation.
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero  = vdupq_n_f32(0);
    const float32x4_t half_pos = vdupq_n_f32(0.5f);
    const float32x4_t half_neg = vdupq_n_f32(-0.5f);

    int32x4_t rounded[4];
    for(int i = 0; i < 4; ++i)
    {
        // Select a +/-0.5 bias matching the sign, accumulate a * 2^n on top of
        // it, then truncate: together this rounds half away from zero.
        const float32x4_t bias = vbslq_f32(vcgeq_f32(a.val[i], zero), half_pos, half_neg);
        rounded[i]             = vcvtq_s32_f32(vmlaq_f32(bias, a.val[i], scale));
    }

    // Saturating narrow: 4x int32x4 -> 2x int16x8 -> 1x int8x16
    const int16x8_t low  = vcombine_s16(vqmovn_s32(rounded[0]), vqmovn_s32(rounded[1]));
    const int16x8_t high = vcombine_s16(vqmovn_s32(rounded[2]), vqmovn_s32(rounded[3]));
    return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
}
908
// Convert eight floats (two float32x4_t lanes) to a saturated Q16 fixed
// point vector: scale by 2^fixed_point_position, round half away from zero,
// then narrow 32 -> 16 bits with saturation.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero  = vdupq_n_f32(0);

    int32x4_t rounded[2];
    for(int i = 0; i < 2; ++i)
    {
        // +/-0.5 bias by sign, multiply-accumulate, truncate: round half away from zero
        const float32x4_t bias = vbslq_f32(vcgeq_f32(a.val[i], zero), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
        rounded[i]             = vcvtq_s32_f32(vmlaq_f32(bias, a.val[i], scale));
    }

    // Saturating narrow to 16 bits
    return vcombine_s16(vqmovn_s32(rounded[0]), vqmovn_s32(rounded[1]));
}
934
// Convert eight Q8 fixed point values to floats (two float32x4_t lanes):
// sign-extend 8 -> 16 -> 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t step    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    const int16x8_t   widened = vmovl_s8(a);

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))), step);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened))), step);
    return out;
}
962
// Convert four Q16 fixed point values to floats: sign-extend to 32 bits,
// convert, then divide by 2^fixed_point_position.
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
970
// Convert sixteen Q8 fixed point values to floats (four float32x4_t lanes):
// sign-extend 8 -> 16 -> 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t step = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen in two stages: 8 -> 16 bits, then 16 -> 32 bits per half
    const int16x8_t low  = vmovl_s8(vget_low_s8(a));
    const int16x8_t high = vmovl_s8(vget_high_s8(a));

    const int32x4_t widened[4] =
    {
        vmovl_s16(vget_low_qs16(low)),
        vmovl_s16(vget_high_qs16(low)),
        vmovl_s16(vget_low_qs16(high)),
        vmovl_s16(vget_high_qs16(high)),
    };

    float32x4x4_t out;
    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(vcvtq_f32_s32(widened[i]), step);
    }
    return out;
}
1010
// Convert eight Q16 fixed point values to floats (two float32x4_t lanes):
// sign-extend to 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t step = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), step);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), step);
    return out;
}
1036
// Per-lane reciprocal (1/a) of a Q8 fixed point vector, non-saturating ops.
// The input is normalized into [0.5, 1) by a left shift, the reciprocal of the
// normalized value is found with Newton-Raphson, and the same shift is applied
// again to undo the normalization (1/(x*2^s) == (1/x)*2^-s).
// NOTE(review): negative inputs receive a positive seed and do not appear to
// converge - confirm callers only pass positive divisors.
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition, which always overshot past 1 and relied on
    // the clamp below; the quad-register variant vrecipq_qs8 subtracts.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1060
// Per-lane reciprocal (1/a) of a Q16 fixed point vector, non-saturating ops.
// Normalize into [0.5, 1), run Newton-Raphson, then undo the normalization.
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: for 16-bit lanes the bit width is 16 (was 8, which
    // never normalized the operand into [0.5, 1); vrecipq_qs16 uses 16)
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vshl_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant subtracts.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1086
// Per-lane reciprocal (1/a) of a Q8 fixed point vector, saturating ops.
// Same algorithm as vrecip_qs8 but using vq* intrinsics so intermediate
// overflow saturates instead of wrapping.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    // (vqsub_s8 for consistency with vqrecipq_qs8)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant vqrecipq_qs8 subtracts.
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1110
// Per-lane reciprocal (1/a) of a Q16 fixed point vector, saturating ops.
// Same algorithm as vrecip_qs16 but using vq* intrinsics so intermediate
// overflow saturates instead of wrapping.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: for 16-bit lanes the bit width is 16 (was 8, which
    // never normalized the operand into [0.5, 1); vqrecipq_qs16 uses 16)
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vqshl_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant subtracts.
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    // (vqmul_qs16 throughout: the outer multiply used the non-saturating
    //  vmul_qs16, inconsistent with vqrecip_qs8)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1136
// Per-lane reciprocal (1/a) of a Q8 fixed point vector (quad register,
// non-saturating ops). The input is normalized into [0.5, 1) by a left shift,
// the reciprocal of the normalized value is found by Newton-Raphson, and the
// same shift is applied again to undo the normalization.
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1161
// Per-lane reciprocal (1/a) of a Q16 fixed point vector (quad register,
// non-saturating ops). Normalize into [0.5, 1), run Newton-Raphson, undo shift.
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is 23130 = 0x5A5A (was 0x5A56, inconsistent with vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vshlq_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1188
// Per-lane reciprocal (1/a) of a Q8 fixed point vector (quad register,
// saturating ops). Same algorithm as vrecipq_qs8 but intermediate overflow
// saturates instead of wrapping.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1213
// Per-lane reciprocal (1/a) of a Q16 fixed point vector (quad register,
// saturating ops). Lanes where a == 0 return the saturated maximum.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is 23130 = 0x5A5A (was 0x5A56, inconsistent with vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vqshlq_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero returns INT16_MAX)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1241
// Per-lane Q8 fixed point division: a / b == a * (1 / b)
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1246
// Per-lane Q16 fixed point division: a / b == a * (1 / b)
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1251
// Per-lane Q8 fixed point division (quad register): a / b == a * (1 / b)
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1256
// Per-lane Q16 fixed point division (quad register): a / b == a * (1 / b)
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1261
// Degree-4 polynomial approximation (no constant term), Horner evaluation:
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// Coefficients A..D come from exp_tab_qs8 (islog == false) or log_tab_qs8
// (islog == true), stored in Q0.7, rescaled to the runtime Q format by a
// rounding shift of -(7 - fixed_point_position). For the log table the first
// coefficient is shifted one bit less, i.e. doubled relative to the others --
// NOTE(review): presumably because it exceeds the Q0.7 range; confirm against
// the log_tab_qs8 definition.
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1277
// Degree-4 polynomial approximation (no constant term), Horner evaluation:
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// Coefficients come from exp_tab_qs16 / log_tab_qs16 (Q0.15) and are rescaled
// to the runtime Q format with a rounding shift of -(15 - fixed_point_position).
// The log table's first coefficient is shifted one bit less (doubled) --
// NOTE(review): presumably out of Q0.15 range; confirm against log_tab_qs16.
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1293
// Saturating variant of vtaylor_poly_qs8: same degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// but every add/multiply/shift saturates instead of wrapping.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Rescale the Q0.7 table coefficients to the runtime Q format; the log
    // table's first coefficient gets one bit less shift (doubled)
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1309
// Saturating variant of vtaylor_poly_qs16: same degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// but every add/multiply/shift saturates instead of wrapping.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Rescale the Q0.15 table coefficients to the runtime Q format; the log
    // table's first coefficient gets one bit less shift (doubled)
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1325
// Quad-register variant of vtaylor_poly_qs8: degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// with coefficients from exp_tabq_qs8 / log_tabq_qs8 (Q0.7) rescaled by a
// rounding shift of -(7 - fixed_point_position); non-saturating ops.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient gets one bit less shift (doubled)
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1341
1342template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001343inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1344{
1345 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1346 const qint16x8_t const_one = vdupq_n_s16(1);
1347 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1348 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1349 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1350 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1351 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1352 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1353 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1354 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1355 return res;
1356}
1357
// Saturating quad-register variant of the degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// (see vtaylor_polyq_qs8); every add/multiply/shift saturates.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Rescale the Q0.7 table coefficients; the log table's first coefficient
    // gets one bit less shift (doubled)
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1373
// Saturating quad-register variant of the degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// (see vtaylor_polyq_qs16); every add/multiply/shift saturates.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Rescale the Q0.15 table coefficients; the log table's first coefficient
    // gets one bit less shift (doubled)
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1389
// Saturating per-lane exponential e^a of a Q8 fixed point vector.
// Strategy: write a = m*ln(2) + alpha with integer m and |alpha| < ln(2),
// approximate e^alpha with a polynomial, then reconstruct via a shift by m
// (multiplication by 2^m).
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) ~= 0.6931 in Q0.7, rounded-shifted to the runtime format
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 0x38 is the fractional part of 1/ln(2) ~= 1.4427 in Q0.7; OR-ing in
    // const_one sets the integer bit, yielding 1/ln(2) in the runtime format
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1415
// Saturating per-lane exponential e^a of a Q16 fixed point vector.
// Same scheme as vqexp_qs8: a = m*ln(2) + alpha, polynomial for e^alpha,
// reconstruction by a shift of m (multiplication by 2^m).
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) ~= 0.6931 in Q0.15, rounded-shifted to the runtime format
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing in const_one
    // sets the integer bit, yielding 1/ln(2) in the runtime format
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1441
// Saturating per-lane exponential e^a of a Q8 fixed point vector
// (quad register). Same scheme as vqexp_qs8.
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7; 0x38 is the fractional part of 1/ln(2) in Q0.7,
    // with the integer bit OR-ed in below
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1467
// Saturating per-lane exponential e^a of a Q16 fixed point vector
// (quad register). Same scheme as vqexp_qs16.
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15; 0x38AA is the fractional part of 1/ln(2) in
    // Q0.15, with the integer bit OR-ed in below
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1493
// Per-lane natural logarithm ln(a) of a Q8 fixed point vector.
// For 0 < a < 1 the lane is replaced by 1/a and the sign of the result is
// flipped at the end (ln(1/x) == -ln(x)). The value is then split as
// a = 2^n * x with x in (1, 2], ln(x) is approximated by a polynomial and
// ln(a) = (poly + n) * ln(2) is reconstructed.
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip = vdup_n_s8(0);
    // Fixed: select operands were swapped - the lanes with a < 1 must feed 'a'
    // (not the zero placeholder) into the reciprocal. The zero placeholder goes
    // to the other lanes, whose bogus reciprocal is discarded by the select
    // after vrecip_qs8.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a = vbsl_s8(calc_reciprocal, recip, a);

    // Integer part of a (arithmetic shift right by the fractional bit count)
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2], keep x - 1 for the polynomial
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum = vmul_s8(shift_value, const_one); // n in fixed point

    // Polynomial Approximation of ln(x) around x = 1
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: ln(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1532
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001533inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1534{
1535 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1536 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1537 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1538
1539 // If 0 < a < 1, calculate log(1/x)
1540 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1541 qint16x4_t recip = vdup_n_s16(0);
1542 recip = vbsl_s16(calc_reciprocal, recip, a);
1543
1544 // Calculate reciprocal
1545 recip = vrecip_qs16(recip, fixed_point_position);
1546 a = vbsl_s16(calc_reciprocal, recip, a);
1547
1548 // Get decimal part of a
1549 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1550 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1551
1552 // Get exponent of 2^n which is equal or less than dec_a
1553 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1554
1555 // Get x to range (1, 2]
1556 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1557 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1558 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1559
1560 // Polynomial Approximation
1561 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1562
1563 // Reconstruct
1564 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1565
1566 // Set negative value for 0 < a < 1
1567 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1568
1569 return poly;
1570}
1571
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001572inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
1573{
1574 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1575 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
1576 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1577
1578 // If 0 < a < 1, calculate log(1/x)
1579 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
1580 qint8x16_t recip = vdupq_n_s8(0);
1581 recip = vbslq_s8(calc_reciprocal, a, recip);
1582
1583 // Calculate reciprocal
1584 recip = vrecipq_qs8(recip, fixed_point_position);
1585 a = vbslq_s8(calc_reciprocal, recip, a);
1586
1587 // Get decimal part of a
1588 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
1589 qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
1590
1591 // Get exponent of 2^n which is equal or less than dec_a
1592 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
1593
1594 // Get x to range (1, 2]
1595 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
1596 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
1597 const qint8x16_t sum = vmulq_s8(shift_value, const_one);
1598
1599 // Polynomial Approximation
1600 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
1601
1602 // Reconstruct
1603 poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
1604
1605 // Set negative value for 0 < a < 1
1606 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
1607
1608 return poly;
1609}
1610
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001611inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
1612{
1613 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1614 const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
1615 const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1616
1617 // If 0 < a < 1, calculate log(1/x)
1618 uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
1619 qint16x8_t recip = vdupq_n_s16(0);
1620 recip = vbslq_s16(calc_reciprocal, a, recip);
1621
1622 // Calculate reciprocal
1623 recip = vqrecipq_qs16(recip, fixed_point_position);
1624 a = vbslq_s16(calc_reciprocal, recip, a);
1625
1626 // Get decimal part of a
1627 qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
1628 qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position
1629
1630 // Get exponent of 2^n which is equal or less than dec_a
1631 shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
1632
1633 // Get x to range (1, 2]
1634 const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
1635 const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
1636 const qint16x8_t sum = vmulq_s16(shift_value, const_one);
1637
1638 // Polynomial Approximation
1639 qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
1640
1641 // Reconstruct
1642 poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
1643
1644 // Set negative value for 0 < a < 1
1645 poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
1646
1647 return poly;
1648}
1649
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshl_s8(x, shift_value2);
}
1676
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshl_s16(x, shift_value2);
}
1705
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshl_s8(x, shift_value2);
}
1732
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshl_s16(x, shift_value2);
}
1761
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshlq_s8(x, shift_value2);
}
1788
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshlq_s16(x, shift_value2);
}
1817
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs8 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshlq_s8(x, shift_value2);
}
1844
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs16 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshlq_s16(x, shift_value2);
}
1873
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001874inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001875{
1876 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1877 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1878
1879 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1880 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1881 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001882 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001883
1884 return tanh;
1885}
1886
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001887inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001888{
1889 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1890 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1891
1892 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1893 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1894 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001895 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001896
1897 return tanh;
1898}
1899
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001900inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001901{
1902 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1903 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1904
1905 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1906 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1907 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1908 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1909
1910 return tanh;
1911}
1912
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001913inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1914{
1915 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1916 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1917
1918 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1919 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1920 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1921 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1922
1923 return tanh;
1924}
1925
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001926inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1927{
1928 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1929}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001930
1931inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1932{
1933 float32x4x2_t res =
1934 {
1935 {
1936 vmaxq_f32(a.val[0], b.val[0]),
1937 vmaxq_f32(a.val[1], b.val[1])
1938 }
1939 };
1940 return res;
1941}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001942}