blob: dd1066d6bcc1aca4503c117f25eda519645c0bf3 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the lower half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 QS8 elements from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load a single QS8 element and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single QS16 element and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single QS8 element and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single QS16 element and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load 2x8 interleaved QS16 elements (de-interleaving 2-element structures). */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 QS8 elements to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store 2x8 QS16 elements to memory, interleaving 2-element structures. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow: QS16x8 -> QS8x8. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32x4 -> QS16x4. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Duplicate a QS8 scalar across all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar across all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar across all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
/** Duplicate a float scalar across 16 lanes, converted to QS8 fixed point.
 *
 * @param a                    Value to broadcast.
 * @param fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 16-lane QS8 vector holding the (saturated) fixed-point value.
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Broadcast once, then replicate the register into the 4-vector structure
    // expected by the float -> QS8 conversion helper.
    const float32x4_t   dup       = vdupq_n_f32(a);
    const float32x4x4_t broadcast = { { dup, dup, dup, dup } };
    return vqcvtq_qs8_f32(broadcast, fixed_point_position);
}

/** Duplicate a float scalar across 8 lanes, converted to QS16 fixed point.
 *
 * @param a                    Value to broadcast.
 * @param fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 8-lane QS16 vector holding the (saturated) fixed-point value.
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    const float32x4_t   dup       = vdupq_n_f32(a);
    const float32x4x2_t broadcast = { { dup, dup } };
    return vqcvtq_qs16_f32(broadcast, fixed_point_position);
}
264
/** Duplicate a QS16 scalar across all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar across all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
274
/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating: |INT16_MIN| wraps). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating: |INT16_MIN| wraps). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
314
/** Element-wise maximum of two 8-lane QS8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 4-lane QS16 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 16-lane QS8 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent QS8 elements across a and b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent QS16 elements across a and b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 8-lane QS16 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8-lane QS8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 4-lane QS16 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 16-lane QS8 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent QS8 elements across a and b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent QS16 elements across a and b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 8-lane QS16 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
374
/** Element-wise (wrapping) addition of two 8-lane QS8 vectors. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Element-wise (wrapping) addition of two 4-lane QS16 vectors. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Element-wise (wrapping) addition of two 16-lane QS8 vectors. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Element-wise (wrapping) addition of two 8-lane QS16 vectors. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Element-wise saturating addition of two 8-lane QS8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Element-wise saturating addition of two 4-lane QS16 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Element-wise saturating addition of two 2-lane QS32 vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Element-wise saturating addition of two 16-lane QS8 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Element-wise saturating addition of two 8-lane QS16 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Element-wise saturating addition of two 4-lane QS32 vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-long: sums adjacent QS8 pairs into 4 widened 16-bit lanes. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
429
/** Element-wise (wrapping) subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Element-wise (wrapping) subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Element-wise (wrapping) subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Element-wise (wrapping) subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Element-wise saturating subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Element-wise saturating subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Element-wise saturating subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Element-wise saturating subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
469
/** Fixed-point multiplication of two 8-lane QS8 vectors (non-saturating).
 *
 * Widens to 16 bits, multiplies, rounds, shifts back down by
 * @p fixed_point_position and narrows (wrapping) to QS8.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply accumulated on top of the bias, then scaled back down
    const qint16x8_t scaled = vshlq_s16(vmlal_s8(rounding, a, b), shift_right);

    // Narrow back to QS8 (wrapping)
    return vmovn_s16(scaled);
}

/** Fixed-point multiplication of two 4-lane QS16 vectors (non-saturating).
 *
 * Widens to 32 bits, multiplies, rounds, shifts back down by
 * @p fixed_point_position and narrows (wrapping) to QS16.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply accumulated on top of the bias, then scaled back down
    const qint32x4_t scaled = vshlq_s32(vmlal_s16(rounding, a, b), shift_right);

    // Narrow back to QS16 (wrapping)
    return vmovn_s32(scaled);
}
503
/** Fixed-point multiplication of two 16-lane QS8 vectors (non-saturating).
 *
 * Processes the low and high halves independently through the widened
 * 16-bit domain, then recombines.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(a), vget_low_s8(b));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(a), vget_high_s8(b));

    // Scale back down by fixed_point_position
    lo = vshlq_s16(lo, shift_right);
    hi = vshlq_s16(hi, shift_right);

    // Narrow back to QS8 (wrapping) and recombine
    return vcombine_s8(vmovn_s16(lo), vmovn_s16(hi));
}

/** Fixed-point multiplication of two 8-lane QS16 vectors (non-saturating).
 *
 * Processes the low and high halves independently through the widened
 * 32-bit domain, then recombines.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(a), vget_low_qs16(b));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(a), vget_high_qs16(b));

    // Scale back down by fixed_point_position
    lo = vshlq_s32(lo, shift_right);
    hi = vshlq_s32(hi, shift_right);

    // Narrow back to QS16 (wrapping) and recombine
    return vcombine_s16(vmovn_s32(lo), vmovn_s32(hi));
}
543
/** Saturating fixed-point multiplication of two 8-lane QS8 vectors.
 *
 * Same scheme as vmul_qs8 but uses saturating shift and narrow, so
 * out-of-range products clamp instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply on top of the bias, then saturating scale-down
    const qint16x8_t scaled = vqshlq_s16(vmlal_s8(rounding, a, b), shift_right);

    // Narrow back to QS8 with saturation
    return vqmovn_s16(scaled);
}

/** Saturating fixed-point multiplication of two 4-lane QS16 vectors.
 *
 * Same scheme as vmul_qs16 but uses saturating shift and narrow, so
 * out-of-range products clamp instead of wrapping.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply on top of the bias, then saturating scale-down
    const qint32x4_t scaled = vqshlq_s32(vmlal_s16(rounding, a, b), shift_right);

    // Narrow back to QS16 with saturation
    return vqmovn_s32(scaled);
}
577
/** Saturating fixed-point multiplication of two 16-lane QS8 vectors.
 *
 * Low/high halves are widened, multiplied, scaled with saturating shifts,
 * then narrowed with saturation and recombined.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(a), vget_low_s8(b));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(a), vget_high_s8(b));

    // Saturating scale-down by fixed_point_position
    lo = vqshlq_s16(lo, shift_right);
    hi = vqshlq_s16(hi, shift_right);

    // Narrow back to QS8 with saturation and recombine
    return vcombine_s8(vqmovn_s16(lo), vqmovn_s16(hi));
}

/** Saturating fixed-point multiplication of two 8-lane QS16 vectors.
 *
 * Low/high halves are widened, multiplied, scaled with saturating shifts,
 * then narrowed with saturation and recombined.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(a), vget_low_qs16(b));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(a), vget_high_qs16(b));

    // Saturating scale-down by fixed_point_position
    lo = vqshlq_s32(lo, shift_right);
    hi = vqshlq_s32(hi, shift_right);

    // Narrow back to QS16 with saturation and recombine
    return vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));
}
617
/** Widening fixed-point multiplication: QS8x8 * QS8x8 -> QS16x8.
 *
 * The product is rescaled with a saturating *rounding* right shift
 * (vqrshlq with a negative shift count) by @p fixed_point_position.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
626
/** Widening fixed-point multiplication: QS16x4 * QS16x4 -> QS32x4.
 *
 * The product is rescaled with a saturating right shift by
 * @p fixed_point_position.
 *
 * NOTE(review): unlike vmull_qs8 (which uses the *rounding* shift
 * vqrshlq), this variant truncates via vqshlq; behavior intentionally
 * kept as-is for bit-exact compatibility.
 */
inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Vector multiply long. (The previous initialization of the temporary
    // with a rounding constant was dead code: vmull_s16 overwrites its
    // destination rather than accumulating into it.)
    const qint32x4_t tmp = vmull_s16(a, b);

    // Saturating shift right by fixed_point_position
    return vqshlq_s32(tmp, fixed_point_position_s32);
}
640
/** Fixed-point multiply-accumulate: a + (b * c) on 8-lane QS8 vectors
 * (non-saturating). */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of b and c on top of the bias, then scale back down
    const qint16x8_t product = vshlq_s16(vmlal_s8(rounding, b, c), shift_right);

    // Narrow back to QS8 (wrapping) and accumulate onto a
    return vadd_s8(a, vmovn_s16(product));
}

/** Fixed-point multiply-accumulate: a + (b * c) on 4-lane QS16 vectors
 * (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of b and c on top of the bias, then scale back down
    const qint32x4_t product = vshlq_s32(vmlal_s16(rounding, b, c), shift_right);

    // Narrow back to QS16 (wrapping) and accumulate onto a
    return vadd_s16(a, vmovn_s32(product));
}
674
/** Fixed-point multiply-accumulate: a + (b * c) on 16-lane QS8 vectors
 * (non-saturating). Low/high halves are processed independently. */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half of b*c on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(b), vget_low_s8(c));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(b), vget_high_s8(c));

    // Scale back down by fixed_point_position
    lo = vshlq_s16(lo, shift_right);
    hi = vshlq_s16(hi, shift_right);

    // Narrow (wrapping), accumulate each half onto a, and recombine
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(lo)), vadd_s8(vget_high_s8(a), vmovn_s16(hi)));
}

/** Fixed-point multiply-accumulate: a + (b * c) on 8-lane QS16 vectors
 * (non-saturating). Low/high halves are processed independently. */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half of b*c on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(b), vget_low_qs16(c));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(b), vget_high_qs16(c));

    // Scale back down by fixed_point_position
    lo = vshlq_s32(lo, shift_right);
    hi = vshlq_s32(hi, shift_right);

    // Narrow (wrapping), accumulate each half onto a, and recombine
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(lo)), vadd_s16(vget_high_qs16(a), vmovn_s32(hi)));
}
714
/** Saturating fixed-point multiply-accumulate: a + (b * c) on 8-lane QS8
 * vectors; shift, narrow and final add all saturate. */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + (b * c) on 4-lane QS16
 * vectors; shift, narrow and final add all saturate. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
748
/** Saturating fixed-point multiply-accumulate: a + (b * c) on 16-lane QS8
 * vectors; halves are processed independently and recombined. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + (b * c) on 8-lane QS16
 * vectors; halves are processed independently and recombined. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
790
/** Widening fixed-point multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8)
 * (non-saturating); the product is rescaled before accumulation. */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4)
 * (non-saturating); the product is rescaled before accumulation. */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
824
/** Saturating widening multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8);
 * both the rescaling shift and the final accumulation saturate. */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4);
 * both the rescaling shift and the final accumulation saturate. */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
858
/** Convert 2x4 floats to an 8-lane QS8 fixed-point vector with saturation.
 *
 * Values are scaled by 2^fixed_point_position and rounded
 * half-away-from-zero before narrowing.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t scale    = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero     = vdupq_n_f32(0);
    const float32x4_t half_pos = vdupq_n_f32(0.5f);
    const float32x4_t half_neg = vdupq_n_f32(-0.5f);

    // Sign-dependent rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4_t r0 = vbslq_f32(vcgeq_f32(a.val[0], zero), half_pos, half_neg);
    float32x4_t r1 = vbslq_f32(vcgeq_f32(a.val[1], zero), half_pos, half_neg);

    // Scale onto the bias: r = bias + a * 2^fp
    r0 = vmlaq_f32(r0, a.val[0], scale);
    r1 = vmlaq_f32(r1, a.val[1], scale);

    // Convert to integers, narrow to 16 bits with saturation, recombine
    const int16x8_t wide = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(r0)), vqmovn_s32(vcvtq_s32_f32(r1)));

    // Final saturating narrow down to QS8
    return vqmovn_s16(wide);
}

/** Convert 4 floats to a 4-lane QS16 fixed-point vector with saturation.
 *
 * Values are scaled by 2^fixed_point_position and rounded
 * half-away-from-zero before narrowing.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent rounding bias, then scale onto it
    float32x4_t rounded = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
    rounded             = vmlaq_f32(rounded, a, scale);

    // Convert to integers and narrow to QS16 with saturation
    return vqmovn_s32(vcvtq_s32_f32(rounded));
}
899
/** Convert 4x4 floats to a 16-lane QS8 fixed-point vector with saturation.
 *
 * Each lane is scaled by 2^fixed_point_position and rounded
 * half-away-from-zero (sign-selected +/-0.5 bias) before the two-stage
 * saturating narrow 32 -> 16 -> 8 bits.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncating float -> int32 conversion (bias above implements the rounding)
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // First saturating narrow: 32 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Second saturating narrow: 16 -> 8 bits
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
939
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Convert 8 floats to 16-bit fixed point (Q(15-n).n) with saturation,
    // rounding half away from zero.
    const float32x4_t scale     = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero      = vdupq_n_f32(0);
    const float32x4_t pos_round = vdupq_n_f32(0.5f);
    const float32x4_t neg_round = vdupq_n_f32(-0.5f);

    // Per-lane rounding offset plus scaled value
    const float32x4_t lo = vmlaq_f32(vbslq_f32(vcgeq_f32(a.val[0], zero), pos_round, neg_round), a.val[0], scale);
    const float32x4_t hi = vmlaq_f32(vbslq_f32(vcgeq_f32(a.val[1], zero), pos_round, neg_round), a.val[1], scale);

    // Truncate to s32, then narrow to s16 with saturation
    return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(lo)), vqmovn_s32(vcvtq_s32_f32(hi)));
}
965
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Convert 8 fixed-point values back to float by widening and scaling by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 once, then s16 -> s32 per half before the float conversion
    const int16x8_t widened = vmovl_s8(a);

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened))), scale);
    return out;
}
993
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Convert 4 fixed-point values back to float: widen to s32, convert, scale by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), scale);
}
1001
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Convert 16 fixed-point values back to float by widening in two steps
    // (s8 -> s16 -> s32) and scaling each group of four lanes by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    const int16x8_t lo_s16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi_s16 = vmovl_s8(vget_high_s8(a));

    float32x4x4_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(lo_s16))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(lo_s16))), scale);
    out.val[2] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(hi_s16))), scale);
    out.val[3] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(hi_s16))), scale);
    return out;
}
1041
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Convert 8 fixed-point values back to float: widen each half to s32,
    // convert, and scale by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), scale);
    return out;
}
1067
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // Fixed-point reciprocal 1/a via Newton-Raphson division (non-saturating ops).
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1092
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // Fixed-point reciprocal 1/a via Newton-Raphson division (non-saturating ops).
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: a qint16 element holds 16 bits, not 8 - the previous value of 8
    // produced an over-large shift (wrong normalization) for every 16-bit input.
    // This matches vrecipq_qs16, which already uses 16.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1119
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating variant of vrecip_qs8: same Newton-Raphson division with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1144
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating variant of vrecip_qs16: same Newton-Raphson division with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: a qint16 element holds 16 bits, not 8 - the previous value of 8
    // produced an over-large shift (wrong normalization) for every 16-bit input.
    // This matches vqrecipq_qs16, which already uses 16.
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1171
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // 128-bit variant of vrecip_qs8: Newton-Raphson reciprocal, non-saturating ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1197
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // 128-bit variant of vrecip_qs16: Newton-Raphson reciprocal, non-saturating ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (this function
    // previously used 0x5A56, inconsistent with vrecip_qs16).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element bit width of qint16)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1225
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit variant of vrecip_qs8: Newton-Raphson reciprocal with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1251
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating 128-bit variant of vrecip_qs16: Newton-Raphson reciprocal with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (this function
    // previously used 0x5A56, inconsistent with vqrecip_qs16).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element bit width of qint16)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow: 1/0 saturates to the maximum representable value
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1280
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint8x8_t b_inv = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_inv, fixed_point_position);
}
1285
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint16x4_t b_inv = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_inv, fixed_point_position);
}
1290
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint8x16_t b_inv = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_inv, fixed_point_position);
}
1295
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint16x8_t b_inv = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_inv, fixed_point_position);
}
1300
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Horner evaluation of res = a * (A + a*(B + a*(C + a*D))) with coefficients
    // from exp_tab_qs8 (islog == false) or log_tab_qs8 (islog == true).
    // Table entries are stored in Q0.7; vrshl rescales them to the caller's format.
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1316
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Horner evaluation of res = a * (A + a*(B + a*(C + a*D))) with coefficients
    // from exp_tab_qs16 (islog == false) or log_tab_qs16 (islog == true).
    // Table entries are stored in Q0.15; vrshl rescales them to the caller's format.
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1332
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating variant of vtaylor_poly_qs8: same Horner polynomial
    // res = a * (A + a*(B + a*(C + a*D))) evaluated with vq* (saturating) ops.
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1348
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating variant of vtaylor_poly_qs16: same Horner polynomial
    // res = a * (A + a*(B + a*(C + a*D))) evaluated with vq* (saturating) ops.
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1364
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // 128-bit variant of vtaylor_poly_qs8: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) using the q-register coefficient tables.
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1380
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // 128-bit variant of vtaylor_poly_qs16: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) using the q-register coefficient tables.
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1396
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit variant: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) with vq* (saturating) ops.
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1412
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating 128-bit variant: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) with vq* (saturating) ops.
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1428
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating fixed-point exp(a): reduce the argument to a residual in [0, ln(2)),
    // approximate exp on the residual with a polynomial, then reconstruct with a
    // power-of-two shift.
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1454
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating fixed-point exp(a) for 16-bit lanes: range-reduce to [0, ln(2)),
    // approximate with a polynomial, reconstruct with a power-of-two shift.
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1480
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit fixed-point exp(a): range-reduce to [0, ln(2)),
    // approximate with a polynomial, reconstruct with a power-of-two shift.
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1506
/** Calculate the saturating exponential of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vqexp_qs16: range reduction by ln(2), Taylor
 * polynomial on the residual, reconstruction by a saturating left shift.
 *
 * @param[in] a                    Input fixed point vector.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return exp(a) in the same fixed point format (saturated).
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 constants below to the requested fixed point format.
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1532
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001533inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1534{
1535 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1536 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1537 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1538
1539 // If 0 < a < 1, calculate log(1/x)
1540 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1541 qint8x8_t recip = vdup_n_s8(0);
1542 recip = vbsl_s8(calc_reciprocal, recip, a);
1543
1544 // Calculate reciprocal
1545 recip = vrecip_qs8(recip, fixed_point_position);
1546 a = vbsl_s8(calc_reciprocal, recip, a);
1547
1548 // Get decimal part of a
1549 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1550 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1551
1552 // Get exponent of 2^n which is equal or less than dec_a
1553 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1554
1555 // Get x to range (1, 2]
1556 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1557 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1558 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1559
1560 // Polynomial Approximation
1561 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1562
1563 // Reconstruct
1564 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1565
1566 // Set negative value for 0 < a < 1
1567 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1568
1569 return poly;
1570}
1571
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001572inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1573{
1574 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1575 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1576 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1577
1578 // If 0 < a < 1, calculate log(1/x)
1579 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1580 qint16x4_t recip = vdup_n_s16(0);
1581 recip = vbsl_s16(calc_reciprocal, recip, a);
1582
1583 // Calculate reciprocal
1584 recip = vrecip_qs16(recip, fixed_point_position);
1585 a = vbsl_s16(calc_reciprocal, recip, a);
1586
1587 // Get decimal part of a
1588 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1589 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1590
1591 // Get exponent of 2^n which is equal or less than dec_a
1592 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1593
1594 // Get x to range (1, 2]
1595 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1596 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1597 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1598
1599 // Polynomial Approximation
1600 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1601
1602 // Reconstruct
1603 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1604
1605 // Set negative value for 0 < a < 1
1606 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1607
1608 return poly;
1609}
1610
/** Calculate the natural logarithm of an 8-bit fixed point vector (16 elements).
 *
 * For inputs in (0, 1) the reciprocal is taken first and the sign of the result
 * is flipped (log(x) = -log(1/x)). The input is then normalized to (1, 2], a
 * Taylor polynomial approximates log on that range, and the power-of-two exponent
 * is added back scaled by ln(2).
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return log(a) in the same fixed point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep 'a' only in the lanes that need the reciprocal; other lanes stay 0.
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1649
/** Calculate the natural logarithm of a 16-bit fixed point vector (8 elements).
 *
 * For inputs in (0, 1) the reciprocal is taken first and the sign of the result
 * is flipped (log(x) = -log(1/x)). The input is then normalized to (1, 2], a
 * Taylor polynomial approximates log on that range, and the power-of-two exponent
 * is added back scaled by ln(2).
 *
 * NOTE(review): unlike the other vlog* variants, this one mixes in saturating ops
 * (vqrecipq_qs16, vqsubq_s16, vqmulq_qs16) — presumably to avoid 16-bit overflow;
 * confirm against the sibling implementations.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return log(a) in the same fixed point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep 'a' only in the lanes that need the reciprocal; other lanes stay 0.
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1688
/** Calculate the inverse square root of an 8-bit fixed point vector (8 elements).
 *
 * Normalizes the input into (0.5, 2), runs three Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2), then shifts back by half the normalization amount.
 * Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift: 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1715
/** Calculate the inverse square root of a 16-bit fixed point vector (4 elements).
 *
 * Normalizes the input into (0.5, 2), runs five Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2), then shifts back by half the normalization amount.
 * Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift: 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1744
/** Calculate the saturating inverse square root of an 8-bit fixed point vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 (three iterations of
 * x = (x / 2) * (3 - a * x^2)), but using saturating add/sub/neg/shift/mul ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1771
/** Calculate the saturating inverse square root of a 16-bit fixed point vector (4 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 (five iterations of
 * x = (x / 2) * (3 - a * x^2)), but using saturating add/sub/neg/shift/mul ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1800
/** Calculate the inverse square root of an 8-bit fixed point vector (16 elements).
 *
 * Quad-register counterpart of vinvsqrt_qs8: normalize into (0.5, 2), three
 * Newton-Raphson iterations of x = (x / 2) * (3 - a * x^2), denormalize by half
 * the shift. Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1827
/** Calculate the inverse square root of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vinvsqrt_qs16: normalize into (0.5, 2), five
 * Newton-Raphson iterations of x = (x / 2) * (3 - a * x^2), denormalize by half
 * the shift. Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1856
/** Calculate the saturating inverse square root of an 8-bit fixed point vector (16 elements).
 *
 * Quad-register counterpart of vqinvsqrt_qs8: three Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2) using saturating ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1883
/** Calculate the saturating inverse square root of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vqinvsqrt_qs16: five Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2) using saturating ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1912
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001913inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001914{
1915 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1916 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1917
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001918 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1919 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1920 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1921 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001922
1923 return tanh;
1924}
1925
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001926inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001927{
1928 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1929 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1930
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001931 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1932 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1933 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1934 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001935
1936 return tanh;
1937}
1938
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001939inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001940{
1941 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1942 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1943
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001944 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1945 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1946 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1947 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001948
1949 return tanh;
1950}
1951
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001952inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1953{
1954 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1955 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1956
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001957 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1958 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1959 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1960 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001961
1962 return tanh;
1963}
1964
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001965inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1966{
1967 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1968}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001969
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001970inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1971{
1972 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1973}
1974
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001975inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1976{
1977 float32x4x2_t res =
1978 {
1979 {
1980 vmaxq_f32(a.val[0], b.val[0]),
1981 vmaxq_f32(a.val[1], b.val[1])
1982 }
1983 };
1984 return res;
1985}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001986}