/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25namespace arm_compute
26{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C), // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29), // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F), // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933), // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C), // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29), // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F), // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933), // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
/** Extract the lower half (8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the lower half (4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the upper half (8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the upper half (4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
142
/** Load eight 8 bit fixed point values from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load four 16 bit fixed point values from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load sixteen 8 bit fixed point values from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load eight 16 bit fixed point values from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one 8 bit fixed point value and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one 16 bit fixed point value and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one 8 bit fixed point value and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one 16 bit fixed point value and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}
182
/** Store eight 8 bit fixed point values to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store four 16 bit fixed point values to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store sixteen 8 bit fixed point values to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store eight 16 bit fixed point values to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two interleaved vectors of eight 16 bit fixed point values to memory. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}

/** Saturating narrow: eight 16 bit lanes to eight 8 bit fixed point lanes. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: four 32 bit lanes to four 16 bit fixed point lanes. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
217
/** Duplicate an 8 bit fixed point value across all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a 16 bit fixed point value across all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate an 8 bit fixed point value across all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
232
/** Broadcast a float into all sixteen lanes of an 8 bit fixed point vector.
 *
 * @param a                    Float value to broadcast
 * @param fixed_point_position Number of fractional bits of the result
 * @return Vector with every lane set to the fixed point representation of @p a
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Replicate the scalar into four float vectors, then quantize all of them at once
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}
246
/** Duplicate a 16 bit fixed point value across all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a 32 bit fixed point value across all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
256
/** Absolute value of eight 8 bit fixed point lanes (non-saturating). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of four 16 bit fixed point lanes (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of sixteen 8 bit fixed point lanes (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of eight 16 bit fixed point lanes (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of eight 8 bit fixed point lanes. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of four 16 bit fixed point lanes. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of eight 16 bit fixed point lanes. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
296
/** Lane-wise maximum of eight 8 bit fixed point lanes. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of four 16 bit fixed point lanes. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent 8 bit fixed point lanes of @p a and @p b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent 16 bit fixed point lanes of @p a and @p b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of eight 16 bit fixed point lanes. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Lane-wise minimum of eight 8 bit fixed point lanes. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of four 16 bit fixed point lanes. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent 8 bit fixed point lanes of @p a and @p b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent 16 bit fixed point lanes of @p a and @p b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of eight 16 bit fixed point lanes. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
356
/** Wrapping addition of eight 8 bit fixed point lanes. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Wrapping addition of four 16 bit fixed point lanes. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Wrapping addition of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Wrapping addition of eight 16 bit fixed point lanes. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating addition of eight 8 bit fixed point lanes. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating addition of four 16 bit fixed point lanes. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating addition of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating addition of eight 16 bit fixed point lanes. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Pairwise widening addition: adjacent 8 bit lanes summed into 16 bit lanes. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
401
/** Wrapping subtraction of eight 8 bit fixed point lanes. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Wrapping subtraction of four 16 bit fixed point lanes. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Wrapping subtraction of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Wrapping subtraction of eight 16 bit fixed point lanes. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtraction of eight 8 bit fixed point lanes. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtraction of four 16 bit fixed point lanes. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtraction of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtraction of eight 16 bit fixed point lanes. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
441
/** Fixed point multiplication of eight 8 bit lanes, rounding to nearest.
 *
 * The product is computed in 16 bits, a rounding constant of 2^(fpp-1) is
 * added, then the result is shifted right by @p fixed_point_position and
 * narrowed back to 8 bits without saturation.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount: vshlq with a negative count performs a right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed point multiplication of four 16 bit lanes, rounding to nearest.
 *
 * Same scheme as vmul_qs8 with a 32 bit intermediate; the narrowing back to
 * 16 bits is non-saturating.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
475
/** Fixed point multiplication of sixteen 8 bit lanes, rounding to nearest.
 *
 * The low and high halves are widened to 16 bits and processed separately,
 * then recombined; the final narrowing is non-saturating.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed point multiplication of eight 16 bit lanes, rounding to nearest.
 *
 * The low and high halves are widened to 32 bits and processed separately,
 * then recombined; the final narrowing is non-saturating.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
515
/** Saturating fixed point multiplication of eight 8 bit lanes, rounding to nearest.
 *
 * Like vmul_qs8 but both the right shift (vqshlq) and the final narrowing
 * (vqmovn) saturate.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed point multiplication of four 16 bit lanes, rounding to nearest. */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
549
/** Saturating fixed point multiplication of sixteen 8 bit lanes, rounding to nearest.
 *
 * Low/high halves are widened and processed separately; shift and narrowing
 * both saturate.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed point multiplication of eight 16 bit lanes, rounding to nearest. */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
589
/** Widening fixed point multiplication: eight 8 bit lanes produce eight 16 bit lanes.
 *
 * The right shift by @p fixed_point_position uses vqrshlq (saturating,
 * rounding shift), so no explicit rounding constant is needed.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
598
/** Fixed point multiply-accumulate: a + b * c for eight 8 bit lanes.
 *
 * The product is rounded to nearest; the final addition and narrowing do not
 * saturate (see vqmla_qs8 for the saturating variant).
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed point multiply-accumulate: a + b * c for four 16 bit lanes (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
632
/** Fixed point multiply-accumulate: a + b * c for sixteen 8 bit lanes (non-saturating). */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed point multiply-accumulate: a + b * c for eight 16 bit lanes (non-saturating). */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)))
;
}
672
/** Saturating fixed point multiply-accumulate: a + b * c for eight 8 bit lanes. */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed point multiply-accumulate: a + b * c for four 16 bit lanes. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
706
/** Saturating fixed point multiply-accumulate: a + b * c for sixteen 8 bit lanes. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed point multiply-accumulate: a + b * c for eight 16 bit lanes. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
748
/** Widening fixed point multiply-accumulate: 16 bit a + (8 bit b * 8 bit c).
 *
 * The product stays in the 16 bit accumulator format; no narrowing occurs.
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed point multiply-accumulate: 32 bit a + (16 bit b * 16 bit c). */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
782
/** Saturating widening multiply-accumulate: 16 bit a + (8 bit b * 8 bit c). */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: 32 bit a + (16 bit b * 16 bit c). */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
816
/** Convert eight floats (two float32x4) to eight 8 bit fixed point values.
 *
 * Rounds half away from zero (adds +0.5/-0.5 depending on sign before
 * truncating conversion) and saturates on narrowing.
 *
 * @param a                    Two float vectors holding the eight input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bits with saturation at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
844
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100845inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100846{
847 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
848
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100849 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100850
851 res_f32 = vmlaq_f32(res_f32, a, pow2);
852
853 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
854
855 return vqmovn_s32(res_s32);
856}
857
/** Convert sixteen floats (four float32x4) to sixteen 8 bit fixed point values.
 *
 * Rounds half away from zero and saturates at each narrowing step
 * (32 -> 16 -> 8 bits).
 *
 * @param a                    Four float vectors holding the sixteen input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 bits with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 bits with saturation
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
897
/** Convert eight floats (two float32x4) to eight 16 bit fixed point values.
 *
 * Rounds half away from zero and saturates when narrowing to 16 bits.
 *
 * @param a                    Two float vectors holding the eight input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 bits with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
923
/** Convert an 8-lane QS8 fixed-point vector to two float32x4 vectors.
 *
 * Lanes are sign-extended s8 -> s16 -> s32, converted to float and scaled
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    8-lane QS8 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Two float32x4 vectors holding the 8 converted values
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16, then each half s16 -> s32
    const int16x8_t widened = vmovl_s8(a);
    const int32x4_t lo      = vmovl_s16(vget_low_qs16(widened));
    const int32x4_t hi      = vmovl_s16(vget_high_qs16(widened));

    // Convert to float and apply the 2^-fixed_point_position scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi), scale);
    return out;
}
951
/** Convert a 4-lane QS16 fixed-point vector to one float32x4 vector.
 *
 * Lanes are sign-extended s16 -> s32, converted to float and scaled by
 * 2^-fixed_point_position.
 *
 * @param[in] a                    4-lane QS16 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return The 4 converted values as a float32x4 vector
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), scale);
}
959
/** Convert a 16-lane QS8 fixed-point vector to four float32x4 vectors.
 *
 * Lanes are sign-extended s8 -> s16 -> s32, converted to float and scaled
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16-lane QS8 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Four float32x4 vectors holding the 16 converted values
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 (two halves)
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Widen s16 -> s32 (four quarters)
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the 2^-fixed_point_position scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
999
/** Convert an 8-lane QS16 fixed-point vector to two float32x4 vectors.
 *
 * Lanes are sign-extended s16 -> s32, converted to float and scaled by
 * 2^-fixed_point_position.
 *
 * @param[in] a                    8-lane QS16 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Two float32x4 vectors holding the 8 converted values
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half s16 -> s32
    const int32x4_t lo = vmovl_s16(vget_low_qs16(a));
    const int32x4_t hi = vmovl_s16(vget_high_qs16(a));

    // Convert to float and apply the 2^-fixed_point_position scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi), scale);
    return out;
}
1025
/** Compute the reciprocal 1/a of an 8-lane QS8 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Approximation of 1/a in the same QS8 format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a. This previously used vadd_s8,
    // which inverts the sign of the second term; vrecipq_qs8 performs the
    // subtraction, which is the mathematically correct form.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshl_s8(x, shift_value);
}
1049
/** Compute the reciprocal 1/a of a 4-lane QS16 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Approximation of 1/a in the same QS16 format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value. The bit width is 16 here (was 8, copied from the QS8
    // variant); vclz_s16 counts leading zeros out of 16 bits, and
    // vrecipq_qs16 uses 16 for the same computation.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a. This previously used vadd_s16,
    // which inverts the sign of the second term; vrecipq_qs16 performs the
    // subtraction, which is the mathematically correct form.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if it overshoots
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshl_s16(x, shift_value);
}
1075
/** Compute the reciprocal 1/a of a 16-lane QS8 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Approximation of 1/a in the same QS8 format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive constant, subtracted below)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshlq_s8(x, shift_value);
}
1100
/** Compute the reciprocal 1/a of an 8-lane QS16 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Approximation of 1/a in the same QS16 format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used in vrecip_qs16; both
    // are ~2.823 in Q2.13 — presumably a transcription difference, verify.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshlq_s16(x, shift_value);
}
1127
/** Compute the reciprocal 1/a of a 16-lane QS8 vector using saturating arithmetic.
 *
 * Same algorithm as vrecipq_qs8 (normalise to [0.5, 1), initial guess
 * x0 = 48/17 - 32/17 * a, three Newton-Raphson iterations, de-normalise)
 * but using the saturating add/sub/mul/shift variants.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Saturated approximation of 1/a in the same QS8 format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive constant, subtracted below)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    // NOTE(review): the multiply here is the non-saturating vmulq_qs8 while
    // the iterations below use vqmulq_qs8 — confirm this is intentional.
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vqshlq_s8(x, shift_value);
}
1152
/** Compute the reciprocal 1/a of an 8-lane QS16 vector using saturating arithmetic.
 *
 * Same algorithm as vrecipq_qs16 (normalise to [0.5, 1), initial guess
 * x0 = 48/17 - 32/17 * a, five Newton-Raphson iterations, de-normalise)
 * but using the saturating add/sub/mul/shift variants.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Saturated approximation of 1/a in the same QS16 format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 (positive constant, subtracted below)
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vqshlq_s16(x, shift_value);
}
1179
/** Division of two 8-lane QS8 vectors: a / b computed as a * (1 / b). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_inv = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_inv, fixed_point_position);
}
1184
/** Division of two 4-lane QS16 vectors: a / b computed as a * (1 / b). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_inv = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_inv, fixed_point_position);
}
1189
/** Division of two 16-lane QS8 vectors: a / b computed as a * (1 / b). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_inv = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_inv, fixed_point_position);
}
1194
/** Division of two 8-lane QS16 vectors: a / b computed as a * (1 / b). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_inv = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_inv, fixed_point_position);
}
1199
/** Evaluate a degree-4 polynomial on an 8-lane QS8 vector (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.7 to the requested format with a
 * rounding shift (the leading log coefficient gets one extra left shift).
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one         = vdup_n_s8(1);
    const qint8x8_t c0          = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(coeff_shift, one) : coeff_shift);
    const qint8x8_t c1          = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c2          = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c3          = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint8x8_t acc = vadd_s8(vmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c0);
    return vmul_qs8(a, acc, fixed_point_position);
}
1215
/** Evaluate a degree-4 polynomial on a 4-lane QS16 vector (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.15 to the requested format with a
 * rounding shift (the leading log coefficient gets one extra left shift).
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one         = vdup_n_s16(1);
    const qint16x4_t c0          = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(coeff_shift, one) : coeff_shift);
    const qint16x4_t c1          = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c2          = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c3          = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint16x4_t acc = vadd_s16(vmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c0);
    return vmul_qs16(a, acc, fixed_point_position);
}
1231
/** Evaluate a degree-4 polynomial on an 8-lane QS8 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.7 to the requested format with a
 * saturating rounding shift (the leading log coefficient gets one extra left
 * shift).
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one         = vdup_n_s8(1);
    const qint8x8_t c0          = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(coeff_shift, one) : coeff_shift);
    const qint8x8_t c1          = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c2          = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c3          = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint8x8_t acc = vqadd_s8(vqmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c0);
    return vqmul_qs8(a, acc, fixed_point_position);
}
1247
/** Evaluate a degree-4 polynomial on a 4-lane QS16 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.15 to the requested format with a
 * saturating rounding shift (the leading log coefficient gets one extra left
 * shift).
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one         = vdup_n_s16(1);
    const qint16x4_t c0          = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(coeff_shift, one) : coeff_shift);
    const qint16x4_t c1          = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c2          = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c3          = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint16x4_t acc = vqadd_s16(vqmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c0);
    return vqmul_qs16(a, acc, fixed_point_position);
}
1263
/** Evaluate a degree-4 polynomial on a 16-lane QS8 vector (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.7 to the requested
 * format with a rounding shift (the leading log coefficient gets one extra
 * left shift).
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one         = vdupq_n_s8(1);
    const qint8x16_t c0          = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(coeff_shift, one) : coeff_shift);
    const qint8x16_t c1          = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c2          = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c3          = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint8x16_t acc = vaddq_s8(vmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c0);
    return vmulq_qs8(a, acc, fixed_point_position);
}
1279
/** Evaluate a degree-4 polynomial on an 8-lane QS16 vector (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.15 to the requested
 * format with a rounding shift (the leading log coefficient gets one extra
 * left shift).
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one         = vdupq_n_s16(1);
    const qint16x8_t c0          = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(coeff_shift, one) : coeff_shift);
    const qint16x8_t c1          = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c2          = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c3          = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint16x8_t acc = vaddq_s16(vmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c0);
    return vmulq_qs16(a, acc, fixed_point_position);
}
1295
/** Evaluate a degree-4 polynomial on a 16-lane QS8 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.7 to the requested
 * format with a saturating rounding shift (the leading log coefficient gets
 * one extra left shift).
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one         = vdupq_n_s8(1);
    const qint8x16_t c0          = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(coeff_shift, one) : coeff_shift);
    const qint8x16_t c1          = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c2          = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c3          = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint8x16_t acc = vqaddq_s8(vqmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c0);
    return vqmulq_qs8(a, acc, fixed_point_position);
}
1311
/** Evaluate a degree-4 polynomial on an 8-lane QS16 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.15 to the requested
 * format with a saturating rounding shift (the leading log coefficient gets
 * one extra left shift).
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one         = vdupq_n_s16(1);
    const qint16x8_t c0          = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(coeff_shift, one) : coeff_shift);
    const qint16x8_t c1          = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c2          = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c3          = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint16x8_t acc = vqaddq_s16(vqmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c0);
    return vqmulq_qs16(a, acc, fixed_point_position);
}
1327
/** Saturating exponential e^a of an 8-lane QS8 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return e^a, saturated to the QS8 range
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2), rescaled from Q0.7
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1353
/** Saturating exponential e^a of a 4-lane QS16 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return e^a, saturated to the QS16 range
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2), rescaled from Q0.15
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1379
/** Saturating exponential e^a of a 16-lane QS8 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return e^a, saturated to the QS8 range
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2), rescaled from Q0.7
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1405
/** Saturating exponential e^a of an 8-lane QS16 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return e^a, saturated to the QS16 range
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2), rescaled from Q0.15
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1431
/** Natural logarithm of an 8-lane QS8 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return log(a) in the same QS8 format
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Select a for lanes with a < 1 (0 elsewhere). The operands were swapped,
    // which fed 0 - instead of a - into the reciprocal for exactly the lanes
    // that need it; vlogq_qs8 uses this (correct) order.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1470
/** Natural logarithm of a 4-lane QS16 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return log(a) in the same QS16 format
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Select a for lanes with a < 1 (0 elsewhere). The operands were swapped,
    // which fed 0 - instead of a - into the reciprocal for exactly the lanes
    // that need it; vlogq_qs16 uses this (correct) order.
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1509
/** Natural logarithm of a 16-lane QS8 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return log(a) in the same QS8 format
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // keep a only in lanes with a < 1, 0 elsewhere

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1548
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Natural logarithm of a fixed-point QS16 vector (8 elements).
    // Strategy: for 0 < a < 1 compute -log(1/a); normalize the argument into
    // (1, 2], approximate with a Taylor polynomial, then scale the reconstructed
    // base-2 logarithm by ln(2).
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15); // number of non-sign bits in a 16-bit value
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) in Q0.15, rescaled to the current format

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating)
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one); // n * 1.0: integer part of log2(a)

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (fractional log2 + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1587
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector (8 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint8x8_t three = vdup_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Exponent of a relative to the fixed point position: 8 - clz(a) - fpp.
    const qint8x8_t exponent = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    // Shifting a left by the negated exponent normalizes it into the (0.5, 2) range.
    const qint8x8_t norm_shift = vneg_s8(exponent);

    // Add one to negative exponents so halving them (arithmetic shift right by 1)
    // produces the correct un-normalization shift.
    const uint8x8_t exp_is_neg = vclt_s8(exponent, vdup_n_qs8(0));
    const qint8x8_t adj_exp = vbsl_s8(exp_is_neg, vadd_s8(exponent, vdup_n_s8(1)), exponent);
    const qint8x8_t denorm_shift = vneg_s8(vshr_n_s8(adj_exp, 1));

    // Normalized input also serves as the initial guess
    const qint8x8_t norm = vshl_s8(a, norm_shift);
    qint8x8_t x = norm;

    // Newton-Raphson step: x <- (x / 2) * (3 - norm * x^2).
    // Three rounds give the full result for 8 bit.
    for(int i = 0; i < 3; ++i)
    {
        const qint8x8_t x_sq = vmul_qs8(x, x, fixed_point_position);
        const qint8x8_t prod = vmul_qs8(norm, x_sq, fixed_point_position);
        x = vshr_n_s8(vmul_qs8(x, vsub_s8(three, prod), fixed_point_position), 1);
    }

    // Undo the range normalization
    return vshl_s8(x, denorm_shift);
}
1614
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector (4 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vshl_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1643
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector
    // (8 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshl_s8(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1670
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector
    // (4 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshl_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1699
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector (16 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint8x16_t three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Exponent of a relative to the fixed point position: 8 - clz(a) - fpp.
    const qint8x16_t exponent = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    // Shifting a left by the negated exponent normalizes it into the (0.5, 2) range.
    const qint8x16_t norm_shift = vnegq_s8(exponent);

    // Add one to negative exponents so halving them (arithmetic shift right by 1)
    // produces the correct un-normalization shift.
    const uint8x16_t exp_is_neg = vcltq_s8(exponent, vdupq_n_qs8(0));
    const qint8x16_t adj_exp = vbslq_s8(exp_is_neg, vaddq_s8(exponent, vdupq_n_s8(1)), exponent);
    const qint8x16_t denorm_shift = vnegq_s8(vshrq_n_s8(adj_exp, 1));

    // Normalized input also serves as the initial guess
    const qint8x16_t norm = vshlq_s8(a, norm_shift);
    qint8x16_t x = norm;

    // Newton-Raphson step: x <- (x / 2) * (3 - norm * x^2).
    // Three rounds give the full result for 8 bit.
    for(int i = 0; i < 3; ++i)
    {
        const qint8x16_t x_sq = vmulq_qs8(x, x, fixed_point_position);
        const qint8x16_t prod = vmulq_qs8(norm, x_sq, fixed_point_position);
        x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(three, prod), fixed_point_position), 1);
    }

    // Undo the range normalization
    return vshlq_s8(x, denorm_shift);
}
1726
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector (8 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vshlq_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1755
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector
    // (16 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshlq_s8(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1782
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector
    // (8 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshlq_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1811
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS8 vector (8 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); // 1.0 in fixed point
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); // 2.0 in fixed point

    qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
    qint8x8_t num = vqsub_qs8(exp2x, const_one);
    qint8x8_t den = vqadd_qs8(exp2x, const_one);
    // Use the saturating reciprocal: this is the saturating variant, and the
    // non-saturating vrecip_qs8 previously used here could wrap around on
    // overflow (vqtanhq_qs8 already uses the saturating vqrecipq_qs8).
    qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1824
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS16 vector (4 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position); // 2.0 in fixed point

    qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
    qint16x4_t num = vqsub_qs16(exp2x, const_one);
    qint16x4_t den = vqadd_qs16(exp2x, const_one);
    // Use the saturating reciprocal: this is the saturating variant, and the
    // non-saturating vrecip_qs16 previously used here could wrap around on
    // overflow (vqtanhq_qs16 already uses the saturating vqrecipq_qs16).
    qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1837
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS8 vector (16 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), evaluated with saturating helpers.
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position); // 1.0 in fixed point
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position); // 2.0 in fixed point

    const qint8x16_t exp2x       = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t numerator   = vqsubq_qs8(exp2x, one);
    const qint8x16_t denominator = vqaddq_qs8(exp2x, one);

    return vqmulq_qs8(numerator, vqrecipq_qs8(denominator, fixed_point_position), fixed_point_position);
}
1850
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS16 vector (8 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), evaluated with saturating helpers.
    const qint16x8_t one = vdupq_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x8_t two = vdupq_n_s16(2 << fixed_point_position); // 2.0 in fixed point

    const qint16x8_t exp2x       = vqexpq_qs16(vqmulq_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t numerator   = vqsubq_qs16(exp2x, one);
    const qint16x8_t denominator = vqaddq_qs16(exp2x, one);

    return vqmulq_qs16(numerator, vqrecipq_qs16(denominator, fixed_point_position), fixed_point_position);
}
1863
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // Saturating power function for fixed-point QS8 vectors: a^b = e^(b * ln(a)).
    const qint8x16_t log_a   = vlogq_qs8(a, fixed_point_position);
    const qint8x16_t b_log_a = vqmulq_qs8(b, log_a, fixed_point_position);
    return vqexpq_qs8(b_log_a, fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001868
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    // Lane-wise maximum of two pairs of float32x4 vectors.
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001880}