blob: c879d3e275db2aa181e8e0e5f4125fbd4b183517 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements per vector)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};
39
/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements per vector)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};
51
/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements per vector)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};
63
/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements per vector)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements per vector)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};
87
/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements per vector)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};
99
/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements per vector)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};
111
/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements per vector)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Extract the low half (lower 8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the low half (lower 4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the high half (upper 8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the high half (upper 4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 consecutive 8 bit fixed point values from @p addr. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 consecutive 16 bit fixed point values from @p addr. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 consecutive 8 bit fixed point values from @p addr. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 consecutive 16 bit fixed point values from @p addr. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one 8 bit fixed point value and broadcast it to all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one 16 bit fixed point value and broadcast it to all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one 8 bit fixed point value and broadcast it to all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one 16 bit fixed point value and broadcast it to all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load and de-interleave 2x8 16 bit fixed point values from @p addr. */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 consecutive 8 bit fixed point values to @p addr. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 consecutive 16 bit fixed point values to @p addr. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 consecutive 8 bit fixed point values to @p addr. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 consecutive 16 bit fixed point values to @p addr. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Interleave and store 2x8 16 bit fixed point values to @p addr. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow of an 8-lane 16 bit fixed point vector to 8 bit lanes. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow of a 4-lane 32 bit fixed point vector to 16 bit lanes. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
223
/** Broadcast an 8 bit fixed point scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a 16 bit fixed point scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast an 8 bit fixed point scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
/** Broadcast a float scalar to all 16 lanes of an 8 bit fixed point vector.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 16-lane vector with every lane equal to @p a converted to fixed point
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Broadcast once and replicate the same quad over the four float vectors
    const float32x4_t  dup       = vdupq_n_f32(a);
    const float32x4x4_t broadcast =
    {
        {
            dup,
            dup,
            dup,
            dup,
        }
    };
    return vqcvtq_qs8_f32(broadcast, fixed_point_position);
}
252
/** Broadcast a float scalar to all 8 lanes of a 16 bit fixed point vector.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 8-lane vector with every lane equal to @p a converted to fixed point
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    // Broadcast once and replicate the same quad over both float vectors
    const float32x4_t  dup       = vdupq_n_f32(a);
    const float32x4x2_t broadcast =
    {
        {
            dup,
            dup,
        }
    };
    return vqcvtq_qs16_f32(broadcast, fixed_point_position);
}
264
/** Broadcast a 16 bit fixed point scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a 32 bit fixed point scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
274
/** Absolute value of an 8-lane 8 bit fixed point vector (non-saturating). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of a 4-lane 16 bit fixed point vector (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of a 16-lane 8 bit fixed point vector (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of an 8-lane 16 bit fixed point vector (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of an 8-lane 8 bit fixed point vector. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of a 4-lane 16 bit fixed point vector. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of a 16-lane 8 bit fixed point vector. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of an 8-lane 16 bit fixed point vector. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
314
/** Lane-wise maximum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}
344
/** Lane-wise minimum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
374
/** Lane-wise addition of two 8-lane 8 bit fixed point vectors (wrapping). */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Lane-wise addition of two 4-lane 16 bit fixed point vectors (wrapping). */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Lane-wise addition of two 16-lane 8 bit fixed point vectors (wrapping). */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Lane-wise addition of two 8-lane 16 bit fixed point vectors (wrapping). */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating lane-wise addition of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating lane-wise addition of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating lane-wise addition of two 2-lane 32 bit fixed point vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating lane-wise addition of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating lane-wise addition of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating lane-wise addition of two 4-lane 32 bit fixed point vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}
424
/** Pairwise add-long: sums adjacent pairs of 8 bit fixed point lanes into 16 bit lanes.
 *
 * @param[in] a 8-lane 8 bit fixed point input vector
 *
 * @return 4-lane 16 bit vector of pairwise sums
 */
// Consistency fix: return the project's qint16x4_t alias (an int16x4_t, as the
// interchangeable use of the two types elsewhere in this file shows) instead of
// the raw NEON type, matching every other wrapper in this header. ABI unchanged.
inline qint16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
429
/** Lane-wise subtraction of two 8-lane 8 bit fixed point vectors (wrapping). */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Lane-wise subtraction of two 4-lane 16 bit fixed point vectors (wrapping). */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Lane-wise subtraction of two 16-lane 8 bit fixed point vectors (wrapping). */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Lane-wise subtraction of two 8-lane 16 bit fixed point vectors (wrapping). */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating lane-wise subtraction of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating lane-wise subtraction of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating lane-wise subtraction of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating lane-wise subtraction of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
469
/** Multiply two 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs a right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}
486
/** Multiply two 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs a right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
503
/** Multiply two 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * Widens each half to 16 bit, multiplies, rounds, shifts back, then recombines.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}
523
/** Multiply two 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * Widens each half to 32 bit, multiplies, rounds, shifts back, then recombines.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
543
/** Saturating multiply of two 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}
560
/** Saturating multiply of two 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
577
/** Saturating multiply of two 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}
597
/** Saturating multiply of two 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
617
/** Widening multiply of two 8-lane 8 bit fixed point vectors.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return 16 bit lane-wise product, rounded and saturated by the shift
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Widening multiply, then a saturating rounding right shift
    // (vqrshlq with a negative count shifts right)
    const qint16x8_t product = vmull_s8(a, b);
    const int16x8_t  shift   = vdupq_n_s16(-fixed_point_position);
    return vqrshlq_s16(product, shift);
}
626
/** Multiply-accumulate: a + b * c on 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}
643
/** Multiply-accumulate: a + b * c on 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
660
/** Multiply-accumulate: a + b * c on 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}
680
/** Multiply-accumulate: a + b * c on 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
700
/** Saturating multiply-accumulate: a + b * c on 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}
717
/** Saturating multiply-accumulate: a + b * c on 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
734
/** Saturating multiply-accumulate: a + b * c on 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}
755
/** Saturating multiply-accumulate: a + b * c on 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
776
/** Widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator, rounding to nearest.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiplication operand
 * @param[in] c                    Second 8 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 16 bit fixed point vector (non-saturating)
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}
793
/** Widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator, rounding to nearest.
 *
 * @param[in] a                    32 bit accumulator
 * @param[in] b                    First 16 bit multiplication operand
 * @param[in] c                    Second 16 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 32 bit fixed point vector (non-saturating)
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
810
/** Saturating widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiplication operand
 * @param[in] c                    Second 8 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}
827
/** Saturating widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator.
 *
 * @param[in] a                    32 bit accumulator
 * @param[in] b                    First 16 bit multiplication operand
 * @param[in] c                    Second 16 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 32 bit fixed point range
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
844
/** Convert 8 floats (two float vectors) to an 8-lane 8 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Two float32x4 vectors holding the values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale by 2^fixed_point_position and add the rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to integers
    const int32x4_t res_s32[] =
    {
    };
    const int32x4x2_t res_s32_x2 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow 32 -> 16 -> 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32_x2.val[0]), vqmovn_s32(res_s32_x2.val[1]));

    return vqmovn_s16(res_s16);
}
872
/** Convert 4 floats to a 4-lane 16 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Float values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    const uint32x4_t  non_negative = vcgeq_f32(a, vdupq_n_f32(0));
    const float32x4_t rounding     = vbslq_f32(non_negative, vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // Scale by 2^fixed_point_position, add the offset, truncate and saturate-narrow
    const float32x4_t scaled = vmlaq_f32(rounding, a, scale);
    return vqmovn_s32(vcvtq_s32_f32(scaled));
}
885
/** Convert 16 floats (four float vectors) to a 16-lane 8 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Four float32x4 vectors holding the values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale by 2^fixed_point_position and add the rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to integers
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow 32 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow 16 -> 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
925
// Convert eight floats (2x float32x4) to 16-bit fixed point (Qx.fixed_point_position).
// Values are scaled by 2^fixed_point_position, rounded half away from zero, and
// saturated on the final narrowing.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 for negative ones
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow 32 -> 16 bits
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
951
// Convert eight 8-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as two float32x4 vectors (low half in val[0], high half in val[1]).
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
979
/* Convert four 16-bit fixed-point values (Qx.fixed_point_position) to floats:
 * sign-extend to 32 bits, convert, then scale by 2^-fixed_point_position. */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    const float32x4_t scale    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    const float32x4_t widened  = vcvtq_f32_s32(vmovl_s16(a));
    return vmulq_f32(widened, scale);
}
987
// Convert sixteen 8-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as four float32x4 vectors in ascending lane order.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1027
// Convert eight 16-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as two float32x4 vectors (low half in val[0], high half in val[1]).
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1053
// Compute the reciprocal 1/a of eight 8-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1078
// Compute the reciprocal 1/a of four 16-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: the element width here is 16 bits (was 8, copied from the qint8 variant and
    // matching neither vclz_s16's range nor the 128-bit vrecipq_qs16, which uses 16).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1105
// Compute the reciprocal 1/a of eight 8-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1130
// Compute the reciprocal 1/a of four 16-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: the element width here is 16 bits (was 8, copied from the qint8 variant and
    // matching neither vclz_s16's range nor the 128-bit vqrecipq_qs16, which uses 16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1157
// Compute the reciprocal 1/a of sixteen 8-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1183
// Compute the reciprocal 1/a of eight 16-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: constant was 0x5A56; 48/17 ~= 2.8235 is 0x5A5A in Q2.13, matching the
    // vrecip_qs16 / vqrecip_qs16 variants.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element width in bits)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1211
// Compute the reciprocal 1/a of sixteen 8-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1237
// Compute the reciprocal 1/a of eight 16-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic. Lanes where a == 0
// return the maximum representable value.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: constant was 0x5A56; 48/17 ~= 2.8235 is 0x5A5A in Q2.13, matching the
    // vrecip_qs16 / vqrecip_qs16 variants.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element width in bits)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero yields the max value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1266
/* Fixed-point division a/b for eight 8-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1271
/* Fixed-point division a/b for four 16-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1276
/* Fixed-point division a/b for sixteen 8-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1281
/* Fixed-point division a/b for eight 16-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1286
// Evaluate the 4th-order polynomial a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on eight 8-bit fixed-point lanes (non-saturating arithmetic).
// Coefficients come from log_tab_qs8 when islog, otherwise exp_tab_qs8, and are
// rescaled from Q0.7 to the requested Qx.fixed_point_position format.
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1302
// Evaluate the 4th-order polynomial a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on four 16-bit fixed-point lanes (non-saturating arithmetic).
// Coefficients come from log_tab_qs16 when islog, otherwise exp_tab_qs16, and
// are rescaled from Q0.15 to the requested Qx.fixed_point_position format.
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1318
// Saturating variant of vtaylor_poly_qs8: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on eight 8-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tab_qs8 when islog, otherwise exp_tab_qs8.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1334
// Saturating variant of vtaylor_poly_qs16: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on four 16-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tab_qs16 when islog, otherwise exp_tab_qs16.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1350
// 128-bit variant of vtaylor_poly_qs8: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on sixteen 8-bit fixed-point lanes (non-saturating).
// Coefficients come from log_tabq_qs8 when islog, otherwise exp_tabq_qs8.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1366
// 128-bit variant of vtaylor_poly_qs16: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on eight 16-bit fixed-point lanes (non-saturating).
// Coefficients come from log_tabq_qs16 when islog, otherwise exp_tabq_qs16.
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1382
// Saturating 128-bit variant: evaluate a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on sixteen 8-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tabq_qs8 when islog, otherwise exp_tabq_qs8.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1398
// Saturating 128-bit variant: evaluate a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on eight 16-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tabq_qs16 when islog, otherwise exp_tabq_qs16.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1414
// Compute exp(a) on eight 8-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Shift converting the Q0.7 constants below to the target format
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1440
// Compute exp(a) on four 16-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Shift converting the Q0.15 constants below to the target format
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1466
// Compute exp(a) on sixteen 8-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Shift converting the Q0.7 constants below to the target format
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1492
// Compute exp(a) on eight 16-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Shift converting the Q0.15 constants below to the target format
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1518
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001519inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1520{
1521 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1522 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1523 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1524
1525 // If 0 < a < 1, calculate log(1/x)
1526 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1527 qint8x8_t recip = vdup_n_s8(0);
1528 recip = vbsl_s8(calc_reciprocal, recip, a);
1529
1530 // Calculate reciprocal
1531 recip = vrecip_qs8(recip, fixed_point_position);
1532 a = vbsl_s8(calc_reciprocal, recip, a);
1533
1534 // Get decimal part of a
1535 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1536 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1537
1538 // Get exponent of 2^n which is equal or less than dec_a
1539 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1540
1541 // Get x to range (1, 2]
1542 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1543 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1544 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1545
1546 // Polynomial Approximation
1547 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1548
1549 // Reconstruct
1550 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1551
1552 // Set negative value for 0 < a < 1
1553 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1554
1555 return poly;
1556}
1557
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001558inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1559{
1560 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1561 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1562 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1563
1564 // If 0 < a < 1, calculate log(1/x)
1565 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1566 qint16x4_t recip = vdup_n_s16(0);
1567 recip = vbsl_s16(calc_reciprocal, recip, a);
1568
1569 // Calculate reciprocal
1570 recip = vrecip_qs16(recip, fixed_point_position);
1571 a = vbsl_s16(calc_reciprocal, recip, a);
1572
1573 // Get decimal part of a
1574 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1575 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1576
1577 // Get exponent of 2^n which is equal or less than dec_a
1578 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1579
1580 // Get x to range (1, 2]
1581 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1582 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1583 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1584
1585 // Polynomial Approximation
1586 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1587
1588 // Reconstruct
1589 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1590
1591 // Set negative value for 0 < a < 1
1592 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1593
1594 return poly;
1595}
1596
/** Calculate the natural logarithm of a 16-lane QS8 fixed point vector.
 *
 * Inputs in (0, 1) are handled via log(a) = -log(1/a). The argument is then
 * normalized into (1, 2], a Taylor polynomial approximates log of the residual,
 * and the result is reconstructed as (poly + n) * ln(2) where 2^n was the
 * normalizing factor.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane log(a) in the same fixed point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // lanes needing the reciprocal get 'a', others 0

    // Calculate reciprocal and substitute it into the affected lanes
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // n in fixed point format

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1635
/** Calculate the natural logarithm of an 8-lane QS16 fixed point vector.
 *
 * Inputs in (0, 1) are handled via log(a) = -log(1/a). The argument is then
 * normalized into (1, 2], a Taylor polynomial approximates log of the residual,
 * and the result is reconstructed as (poly + n) * ln(2). Note this variant uses
 * saturating sub/add/mul in several steps, unlike vlogq_qs8.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane log(a) in the same fixed point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    recip                      = vbslq_s16(calc_reciprocal, a, recip); // lanes needing the reciprocal get 'a', others 0

    // Calculate reciprocal (saturating variant) and substitute it into the affected lanes
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // n in fixed point format

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), saturating
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1674
/** Calculate the inverse square root 1/sqrt(a) of an 8-lane QS8 fixed point vector.
 *
 * Normalizes the input into (0.5, 2) by an even power-of-two shift, then runs
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2) and undoes half the
 * normalizing shift at the end (since sqrt(2^s) = 2^(s/2)).
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vshl_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1701
/** Calculate the inverse square root 1/sqrt(a) of a 4-lane QS16 fixed point vector.
 *
 * Same scheme as vinvsqrt_qs8: normalize into (0.5, 2), apply Newton-Raphson
 * iterations x <- (x / 2) * (3 - a * x^2), then undo half the normalizing shift.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vshl_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1730
/** Calculate the saturating inverse square root 1/sqrt(a) of an 8-lane QS8 fixed point vector.
 *
 * Saturating counterpart of vinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vqshl_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1757
/** Calculate the saturating inverse square root 1/sqrt(a) of a 4-lane QS16 fixed point vector.
 *
 * Saturating counterpart of vinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vqshl_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1786
/** Calculate the inverse square root 1/sqrt(a) of a 16-lane QS8 fixed point vector.
 *
 * Quad-register variant of vinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vshlq_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1813
/** Calculate the inverse square root 1/sqrt(a) of an 8-lane QS16 fixed point vector.
 *
 * Quad-register variant of vinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vshlq_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1842
/** Calculate the saturating inverse square root 1/sqrt(a) of a 16-lane QS8 fixed point vector.
 *
 * Quad-register variant of vqinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vqshlq_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1869
/** Calculate the saturating inverse square root 1/sqrt(a) of an 8-lane QS16 fixed point vector.
 *
 * Quad-register variant of vqinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vqshlq_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1898
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001899inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001900{
1901 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1902 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1903
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001904 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1905 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1906 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1907 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001908
1909 return tanh;
1910}
1911
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001912inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001913{
1914 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1915 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1916
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001917 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1918 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1919 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1920 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001921
1922 return tanh;
1923}
1924
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001925inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001926{
1927 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1928 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1929
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001930 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1931 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1932 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1933 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001934
1935 return tanh;
1936}
1937
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001938inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1939{
1940 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1941 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1942
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001943 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1944 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1945 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1946 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001947
1948 return tanh;
1949}
1950
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001951inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1952{
1953 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1954}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001955
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001956inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1957{
1958 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1959}
1960
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001961inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1962{
1963 float32x4x2_t res =
1964 {
1965 {
1966 vmaxq_f32(a.val[0], b.val[0]),
1967 vmaxq_f32(a.val[1], b.val[1])
1968 }
1969 };
1970 return res;
1971}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001972}