blob: f62a338a61cc3145692f97d67b18adc698317f13 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25namespace arm_compute
26{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
/** Extract the lower half (8 lanes) of a 16-lane QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the lower half (4 lanes) of an 8-lane QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the upper half (8 lanes) of a 16-lane QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the upper half (4 lanes) of an 8-lane QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load 8 QS8 values from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 values from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 values from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 values from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load a single QS8 value and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single QS16 value and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single QS8 value and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single QS16 value and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store 8 QS8 values to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 values to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 values to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 values to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two 8-lane QS16 vectors to memory with 2-way interleaving. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
207
/** Saturating narrow: QS16 (8 lanes) to QS8 (8 lanes). */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32 (4 lanes) to QS16 (4 lanes). */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Broadcast a QS8 scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a QS16 scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast a QS8 scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Broadcast a float to all 16 lanes of a QS8 vector.
 *
 * The float is replicated into four float32x4_t vectors and converted to
 * fixed point via vqcvtq_qs8_f32 (defined further down in this file).
 *
 * @param a                    Value to broadcast
 * @param fixed_point_position Number of fractional bits of the QS8 format
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Broadcast a QS16 scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a QS32 scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
256
/** Absolute value of 8 QS8 lanes (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 lanes (non-saturating: |INT16_MIN| wraps). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 lanes (non-saturating: |INT8_MIN| wraps). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 lanes (non-saturating: |INT16_MIN| wraps). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 lanes (|INT8_MIN| clamps to INT8_MAX). */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 lanes (|INT16_MIN| clamps to INT16_MAX). */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 lanes (|INT8_MIN| clamps to INT8_MAX). */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 lanes (|INT16_MIN| clamps to INT16_MAX). */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
296
/** Lane-wise maximum of two 8-lane QS8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of two 4-lane QS16 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of two 16-lane QS8 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum: max of adjacent QS8 lane pairs from a then b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum: max of adjacent QS16 lane pairs from a then b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of two 8-lane QS16 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Lane-wise minimum of two 8-lane QS8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of two 4-lane QS16 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of two 16-lane QS8 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum: min of adjacent QS8 lane pairs from a then b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum: min of adjacent QS16 lane pairs from a then b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of two 8-lane QS16 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
356
/** Lane-wise wrapping addition of two 8-lane QS8 vectors. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Lane-wise wrapping addition of two 4-lane QS16 vectors. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Lane-wise wrapping addition of two 16-lane QS8 vectors. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Lane-wise wrapping addition of two 8-lane QS16 vectors. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Lane-wise saturating addition of two 8-lane QS8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Lane-wise saturating addition of two 4-lane QS16 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Lane-wise saturating addition of two 2-lane QS32 vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Lane-wise saturating addition of two 16-lane QS8 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Lane-wise saturating addition of two 8-lane QS16 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Lane-wise saturating addition of two 4-lane QS32 vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Widening pairwise addition: sums adjacent QS8 lane pairs into 4 x int16. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

/** Lane-wise wrapping subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Lane-wise wrapping subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Lane-wise wrapping subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Lane-wise wrapping subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Lane-wise saturating subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Lane-wise saturating subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Lane-wise saturating subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Lane-wise saturating subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
451
/** Fixed-point multiplication of two 8-lane QS8 vectors (non-saturating narrow).
 *
 * Widens to 16 bit, multiply-accumulates onto a rounding constant, then shifts
 * right by @p fixed_point_position (vshlq_s16 with a negative count performs an
 * arithmetic right shift).
 *
 * NOTE(review): assumes fixed_point_position >= 1 — with 0 the rounding term
 * `1 << (fixed_point_position - 1)` is undefined behaviour. TODO confirm callers
 * guarantee this.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiplication of two 4-lane QS16 vectors (non-saturating narrow).
 * Same rounding scheme as vmul_qs8, widening through 32 bit. */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Fixed-point multiplication of two 16-lane QS8 vectors (non-saturating narrow).
 * Processes the low and high halves separately through 16-bit accumulators. */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiplication of two 8-lane QS16 vectors (non-saturating narrow).
 * Processes the low and high halves separately through 32-bit accumulators. */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
525
/** Saturating fixed-point multiplication of two 8-lane QS8 vectors.
 *
 * Same rounding scheme as vmul_qs8, but both the shift (vqshlq) and the
 * narrowing (vqmovn) saturate instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiplication of two 4-lane QS16 vectors. */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating fixed-point multiplication of two 16-lane QS8 vectors.
 * Processes low and high halves separately through 16-bit accumulators. */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiplication of two 8-lane QS16 vectors.
 * Processes low and high halves separately through 32-bit accumulators. */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
599
/** Widening fixed-point multiplication: 8 QS8 lanes to 8 QS16 lanes.
 *
 * Uses vqrshlq_s16 (saturating *rounding* shift) with a negative count, so no
 * explicit rounding constant is needed here, unlike the narrowing multiplies.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
608
/** Fixed-point multiply-accumulate: a + b * c on 8 QS8 lanes (non-saturating).
 *
 * The product is computed in 16 bit with round-to-nearest (rounding constant
 * then arithmetic right shift), narrowed, and added to @p a with wrapping.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on 4 QS16 lanes (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on 16 QS8 lanes (non-saturating).
 * Low and high halves are processed through separate 16-bit accumulators. */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c on 8 QS16 lanes (non-saturating).
 * Low and high halves are processed through separate 32-bit accumulators. */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
682
/** Saturating fixed-point multiply-accumulate: a + b * c on 8 QS8 lanes.
 *
 * Like vmla_qs8 but the shift, the narrowing, and the final addition all
 * saturate (vqshlq / vqmovn / vqadd).
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 4 QS16 lanes. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 16 QS8 lanes.
 * Low and high halves are processed through separate 16-bit accumulators. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 8 QS16 lanes.
 * Low and high halves are processed through separate 32-bit accumulators. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
758
/** Widening fixed-point multiply-accumulate: a + b * c, QS8 operands into a
 * QS16 accumulator (non-saturating).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a + b * c, QS16 operands into a
 * QS32 accumulator (non-saturating).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Widening saturating fixed-point multiply-accumulate: a + b * c, QS8 operands
 * into a QS16 accumulator; shift and accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Widening saturating fixed-point multiply-accumulate: a + b * c, QS16 operands
 * into a QS32 accumulator; shift and accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
826
/** Convert 8 floats (two float32x4_t) to 8 QS8 lanes, saturating.
 *
 * Scales by 2^fixed_point_position and rounds half away from zero: a bias of
 * +0.5 (or -0.5 for negative inputs, selected with vbslq_f32) is added before
 * vcvtq_s32_f32, which truncates toward zero. The result is then narrowed with
 * saturation through int16 to int8.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert 4 floats to 4 QS16 lanes, saturating. Same rounding scheme as
 * vqcvt_qs8_f32. */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    return vqmovn_s32(res_s32);
}

/** Convert 16 floats (four float32x4_t) to 16 QS8 lanes, saturating. Same
 * rounding scheme as vqcvt_qs8_f32. */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}

/** Convert 8 floats (two float32x4_t) to 8 QS16 lanes, saturating. Same
 * rounding scheme as vqcvt_qs8_f32. */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
933
/** Convert a QS8 vector (8 elements) to two float32x4 vectors. */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen 8 -> 16 bit before splitting into two 32-bit halves.
    const int16x8_t widened = vmovl_s8(a);

    // Widen 16 -> 32 bit and convert to float.
    float32x4x2_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened)))
        }
    };

    out.val[0] = vmulq_f32(out.val[0], scale);
    out.val[1] = vmulq_f32(out.val[1], scale);

    return out;
}
961
/** Convert a QS16 vector (4 elements) to a float32x4 vector. */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Widen to 32 bit, convert to float, then scale by 2^-p to place the binary point.
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
969
/** Convert a QS8 vector (16 elements) to four float32x4 vectors. */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen 8 -> 16 bit.
    const int16x8_t lo16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi16 = vmovl_s8(vget_high_s8(a));

    // Widen 16 -> 32 bit and convert to float.
    float32x4x4_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(lo16))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(lo16))),
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(hi16))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(hi16)))
        }
    };

    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(out.val[i], scale);
    }

    return out;
}
1009
/** Convert a QS16 vector (8 elements) to two float32x4 vectors. */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half 16 -> 32 bit and convert to float.
    float32x4x2_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a)))
        }
    };

    out.val[0] = vmulq_f32(out.val[0], scale);
    out.val[1] = vmulq_f32(out.val[1], scale);

    return out;
}
1035
/** Reciprocal 1/a of a QS8 vector (8 elements) via Newton-Raphson. */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vadd_s8), matching vrecipq_qs8.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1059
/** Reciprocal 1/a of a QS16 vector (4 elements) via Newton-Raphson. */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1). Fix: the element width
    // is 16 bits, not 8 (matches vrecipq_qs16's vdupq_n_s16(16)).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vadd_s16), matching vrecipq_qs16.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1085
/** Saturating reciprocal 1/a of a QS8 vector (8 elements) via Newton-Raphson. */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vqadd_s8), matching vqrecipq_qs8.
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1109
/** Saturating reciprocal 1/a of a QS16 vector (4 elements) via Newton-Raphson. */
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1). Fix: the element width
    // is 16 bits, not 8 (matches vrecipq_qs16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vqadd_s16), matching vqrecipq_qs16.
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x).
    // Fix: use the saturating multiply (vqmul_qs16) throughout this saturating variant.
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1135
/** Reciprocal 1/a of a QS8 vector (16 elements) via Newton-Raphson: x = x * (2 - a * x). */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1160
/** Reciprocal 1/a of a QS16 vector (8 elements) via Newton-Raphson. */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 rounds to 0x5A5A (was 0x5A56), matching vrecip_qs16.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1187
/** Saturating reciprocal 1/a of a QS8 vector (16 elements) via Newton-Raphson. */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: use the saturating multiply
    // (vqmulq_qs8, was vmulq_qs8) for consistency with this saturating variant.
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1212
/** Saturating reciprocal 1/a of a QS16 vector (8 elements) via Newton-Raphson. */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 rounds to 0x5A5A (was 0x5A56), matching vrecip_qs16.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s16(x, shift_value);
}
1239
/** Element-wise division a / b of QS8 vectors (8 elements). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1244
/** Element-wise division a / b of QS16 vectors (4 elements). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1249
/** Element-wise division a / b of QS8 vectors (16 elements). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1254
/** Element-wise division a / b of QS16 vectors (8 elements). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1259
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (8 elements). */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x8_t tab_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit   = vdup_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x8_t c0 = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(tab_shift, one_bit) : tab_shift);
    const qint8x8_t c1 = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], tab_shift);
    const qint8x8_t c2 = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], tab_shift);
    const qint8x8_t c3 = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint8x8_t acc = vadd_s8(vmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c0);
    return vmul_qs8(a, acc, fixed_point_position);
}
1275
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (4 elements). */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x4_t tab_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit   = vdup_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x4_t c0 = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(tab_shift, one_bit) : tab_shift);
    const qint16x4_t c1 = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], tab_shift);
    const qint16x4_t c2 = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], tab_shift);
    const qint16x4_t c3 = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint16x4_t acc = vadd_s16(vmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c0);
    return vmul_qs16(a, acc, fixed_point_position);
}
1291
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (8 elements). */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x8_t tab_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit   = vdup_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x8_t c0 = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(tab_shift, one_bit) : tab_shift);
    const qint8x8_t c1 = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], tab_shift);
    const qint8x8_t c2 = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], tab_shift);
    const qint8x8_t c3 = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint8x8_t acc = vqadd_s8(vqmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c0);
    return vqmul_qs8(a, acc, fixed_point_position);
}
1307
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (4 elements). */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x4_t tab_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit   = vdup_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x4_t c0 = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(tab_shift, one_bit) : tab_shift);
    const qint16x4_t c1 = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], tab_shift);
    const qint16x4_t c2 = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], tab_shift);
    const qint16x4_t c3 = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint16x4_t acc = vqadd_s16(vqmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c0);
    return vqmul_qs16(a, acc, fixed_point_position);
}
1323
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (16 elements). */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x16_t tab_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit   = vdupq_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x16_t c0 = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(tab_shift, one_bit) : tab_shift);
    const qint8x16_t c1 = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], tab_shift);
    const qint8x16_t c2 = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], tab_shift);
    const qint8x16_t c3 = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint8x16_t acc = vaddq_s8(vmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c0);
    return vmulq_qs8(a, acc, fixed_point_position);
}
1339
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (8 elements). */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x8_t tab_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit   = vdupq_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x8_t c0 = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(tab_shift, one_bit) : tab_shift);
    const qint16x8_t c1 = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], tab_shift);
    const qint16x8_t c2 = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], tab_shift);
    const qint16x8_t c3 = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint16x8_t acc = vaddq_s16(vmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c0);
    return vmulq_qs16(a, acc, fixed_point_position);
}
1355
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (16 elements). */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x16_t tab_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit   = vdupq_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x16_t c0 = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(tab_shift, one_bit) : tab_shift);
    const qint8x16_t c1 = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], tab_shift);
    const qint8x16_t c2 = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], tab_shift);
    const qint8x16_t c3 = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint8x16_t acc = vqaddq_s8(vqmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c0);
    return vqmulq_qs8(a, acc, fixed_point_position);
}
1371
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (8 elements). */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x8_t tab_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit   = vdupq_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x8_t c0 = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(tab_shift, one_bit) : tab_shift);
    const qint16x8_t c1 = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], tab_shift);
    const qint16x8_t c2 = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], tab_shift);
    const qint16x8_t c3 = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint16x8_t acc = vqaddq_s16(vqmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c0);
    return vqmulq_qs16(a, acc, fixed_point_position);
}
1387
/** Saturating exponential exp(a) of a QS8 vector (8 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 0x38 encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1413
/** Saturating exponential exp(a) of a QS16 vector (4 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1439
/** Saturating exponential exp(a) of a QS8 vector (16 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // 0x38 encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1465
/** Saturating exponential exp(a) of a QS16 vector (8 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1491
/** Natural logarithm ln(a) of a QS8 vector (8 elements). */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Fix: select 'a' into the lanes that take the reciprocal path (mask true).
    // The original selected the constant 0 for those lanes, feeding 0 into
    // vrecip_qs8 and producing garbage for every input below one.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal (lanes that fed 0 are discarded by the select below)
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n (in fixed point)
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log(1 + temp)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: ln(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1530
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001531inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1532{
1533 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1534 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1535 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1536
1537 // If 0 < a < 1, calculate log(1/x)
1538 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1539 qint16x4_t recip = vdup_n_s16(0);
1540 recip = vbsl_s16(calc_reciprocal, recip, a);
1541
1542 // Calculate reciprocal
1543 recip = vrecip_qs16(recip, fixed_point_position);
1544 a = vbsl_s16(calc_reciprocal, recip, a);
1545
1546 // Get decimal part of a
1547 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1548 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1549
1550 // Get exponent of 2^n which is equal or less than dec_a
1551 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1552
1553 // Get x to range (1, 2]
1554 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1555 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1556 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1557
1558 // Polynomial Approximation
1559 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1560
1561 // Reconstruct
1562 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1563
1564 // Set negative value for 0 < a < 1
1565 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1566
1567 return poly;
1568}
1569
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001570inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
1571{
1572 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1573 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
1574 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1575
1576 // If 0 < a < 1, calculate log(1/x)
1577 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
1578 qint8x16_t recip = vdupq_n_s8(0);
1579 recip = vbslq_s8(calc_reciprocal, a, recip);
1580
1581 // Calculate reciprocal
1582 recip = vrecipq_qs8(recip, fixed_point_position);
1583 a = vbslq_s8(calc_reciprocal, recip, a);
1584
1585 // Get decimal part of a
1586 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
1587 qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
1588
1589 // Get exponent of 2^n which is equal or less than dec_a
1590 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
1591
1592 // Get x to range (1, 2]
1593 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
1594 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
1595 const qint8x16_t sum = vmulq_s8(shift_value, const_one);
1596
1597 // Polynomial Approximation
1598 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
1599
1600 // Reconstruct
1601 poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
1602
1603 // Set negative value for 0 < a < 1
1604 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
1605
1606 return poly;
1607}
1608
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001609inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
1610{
1611 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1612 const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
1613 const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1614
1615 // If 0 < a < 1, calculate log(1/x)
1616 uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
1617 qint16x8_t recip = vdupq_n_s16(0);
1618 recip = vbslq_s16(calc_reciprocal, a, recip);
1619
1620 // Calculate reciprocal
1621 recip = vqrecipq_qs16(recip, fixed_point_position);
1622 a = vbslq_s16(calc_reciprocal, recip, a);
1623
1624 // Get decimal part of a
1625 qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
1626 qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position
1627
1628 // Get exponent of 2^n which is equal or less than dec_a
1629 shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
1630
1631 // Get x to range (1, 2]
1632 const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
1633 const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
1634 const qint16x8_t sum = vmulq_s16(shift_value, const_one);
1635
1636 // Polynomial Approximation
1637 qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
1638
1639 // Reconstruct
1640 poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
1641
1642 // Set negative value for 0 < a < 1
1643 poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
1644
1645 return poly;
1646}
1647
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001648inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1649{
1650 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1651
1652 // Find shift value. Number must be in (0.5, 2) range.
1653 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1654
1655 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1656 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1657 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
1658 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
1659 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1660
1661 temp = vshl_s8(a, shift_value);
1662
1663 // Initial guess
1664 qint8x8_t x = temp;
1665
1666 // Calculate (x / 2) * (3 - a * x^2)
1667 // After three iterations we have the result for 8 bit
1668 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1669 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1670 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1671
1672 return vshl_s8(x, shift_value2);
1673}
1674
/** Inverse square root of a fixed point 16 bit vector (4 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(15-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalize the input into the convergence range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vshl_s16(x, shift_value2);
}
1703
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001704inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1705{
1706 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1707
1708 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001709 qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001710
1711 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001712 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001713 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001714 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001715 qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001716
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001717 temp = vqshl_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001718
1719 // Initial guess
1720 qint8x8_t x = temp;
1721
1722 // Calculate (x / 2) * (3 - a * x^2)
1723 // After three iterations we have the result for 8 bit
1724 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1725 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1726 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1727
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001728 return vqshl_s8(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001729}
1730
/** Saturating inverse square root of a fixed point 16 bit vector (4 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(15-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format (saturating arithmetic).
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalize the input into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vqshl_s16(x, shift_value2);
}
1759
/** Inverse square root of a fixed point 8 bit vector (16 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(7-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize the input into the convergence range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vshlq_s8(x, shift_value2);
}
1786
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001787inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1788{
1789 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1790
1791 // Find shift value. Number must be in (0.5, 2) range.
1792 qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1793
1794 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1795 qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1796 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1797 temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
1798 qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
1799
1800 temp = vshlq_s16(a, shift_value);
1801
1802 // Initial guess
1803 qint16x8_t x = temp;
1804
1805 // Calculate (x / 2) * (3 - a * x^2)
1806 // After five iterations we have the result for 16 bit
1807 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1808 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1809 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1810 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1811 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1812
1813 return vshlq_s16(x, shift_value2);
1814}
1815
/** Saturating inverse square root of a fixed point 8 bit vector (16 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(7-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format (saturating arithmetic).
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize the input into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vqshlq_s8(x, shift_value2);
}
1842
1843inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1844{
1845 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1846
1847 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001848 qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001849
1850 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1851 qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1852 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1853 temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001854 qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001855
1856 temp = vqshlq_s16(a, shift_value);
1857
1858 // Initial guess
1859 qint16x8_t x = temp;
1860
1861 // Calculate (x / 2) * (3 - a * x^2)
1862 // After five iterations we have the result for 16 bit
1863 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1864 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1865 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1866 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1867 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1868
1869 return vqshlq_s16(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001870}
1871
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001872inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001873{
1874 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1875 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1876
1877 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1878 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1879 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001880 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001881
1882 return tanh;
1883}
1884
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001885inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001886{
1887 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1888 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1889
1890 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1891 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1892 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001893 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001894
1895 return tanh;
1896}
1897
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001898inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001899{
1900 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1901 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1902
1903 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1904 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1905 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1906 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1907
1908 return tanh;
1909}
1910
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001911inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1912{
1913 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1914 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1915
1916 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1917 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1918 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1919 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1920
1921 return tanh;
1922}
1923
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001924inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1925{
1926 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1927}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001928
1929inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1930{
1931 float32x4x2_t res =
1932 {
1933 {
1934 vmaxq_f32(a.val[0], b.val[0]),
1935 vmaxq_f32(a.val[1], b.val[1])
1936 }
1937 };
1938 return res;
1939}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001940}