blob: 92af82cf716bc11da21f1aac8429feec20ec6b83 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25namespace arm_compute
26{
// Coefficients a0..a3 of the degree-3 polynomial used by the fixed-point
// exp() approximation; one table per element width/count combination.
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
74
// Coefficients a0..a3 of the degree-3 polynomial used by the fixed-point
// log() approximation. The first coefficient exceeds 1.0, hence its wider
// Q1.x format noted below.
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
// Thin typed wrappers mapping the fixed-point (qint*) vector typedefs onto
// the corresponding plain NEON intrinsics. No arithmetic is performed here.

/** Lower half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Lower half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Upper half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Upper half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load 8 QS8 elements from @p addr. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and broadcast it to all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and broadcast it to all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and broadcast it to all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and broadcast it to all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store 8 QS8 elements to @p addr. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Saturating narrow: QS16x8 -> QS8x8. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32x4 -> QS16x4. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
212
/** Broadcast a QS8 scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a QS16 scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast a QS8 scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Broadcast a float to 16 lanes, converting it to QS8 fixed point.
 *
 * @param[in] a                    Value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 16-lane QS8 vector holding @p a converted to fixed point
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Replicate the scalar into a 4x4 float structure, then convert the
    // whole structure with the quad float->QS8 conversion helper.
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Broadcast a QS16 scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a QS32 scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
251
// Element-wise absolute value / min / max wrappers. Fixed-point position is
// irrelevant for these operations, so they forward directly to the NEON
// integer intrinsics. "q" prefix in the name = saturating variant; "p" = pairwise.

/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}

/** Element-wise maximum of two QS8x8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two QS16x4 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two QS8x16 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two QS8x8 vectors. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two QS16x4 vectors. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two QS16x8 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two QS8x8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two QS16x4 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two QS8x16 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two QS8x8 vectors. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two QS16x4 vectors. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two QS16x8 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
351
// Fixed-point addition: with equal fixed_point_position on both operands the
// operation is the plain integer addition, so these forward to NEON directly.

/** Add two QS8x8 vectors (wrapping on overflow). */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two QS16x4 vectors (wrapping on overflow). */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two QS8x16 vectors (wrapping on overflow). */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two QS16x8 vectors (wrapping on overflow). */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two QS8x8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two QS16x4 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two QS8x16 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two QS16x8 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}
391
392inline int16x4_t vpaddl_qs8(qint8x8_t a)
393{
394 return vpaddl_s8(a);
395}
396
// Fixed-point subtraction: like addition, identical to integer subtraction
// when both operands share the same fixed_point_position.

/** Subtract two QS8x8 vectors (wrapping on overflow). */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two QS16x4 vectors (wrapping on overflow). */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two QS8x16 vectors (wrapping on overflow). */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two QS16x8 vectors (wrapping on overflow). */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two QS8x8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two QS16x4 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two QS8x16 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two QS16x8 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
436
// Fixed-point multiply: widen, multiply-accumulate onto a rounding bias of
// 0.5 ulp, shift right by fixed_point_position, narrow back.
// NOTE(review): the rounding term 1 << (fixed_point_position - 1) assumes
// fixed_point_position >= 1 — confirm callers never pass 0.

/** Multiply two QS8x8 vectors in fixed point with round-to-nearest
 * (narrowing wraps rather than saturates — see vqmul_qs8 for the
 * saturating variant).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Multiply two QS16x4 vectors in fixed point with round-to-nearest
 * (non-saturating narrow).
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Multiply two QS8x16 vectors in fixed point with round-to-nearest
 * (non-saturating narrow). Processes the low and high halves separately
 * because the widened intermediate does not fit in one register.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Multiply two QS16x8 vectors in fixed point with round-to-nearest
 * (non-saturating narrow), low/high halves processed separately.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
510
// Saturating fixed-point multiply: same rounding scheme as vmul_* but the
// shift (vqshlq) and the narrowing (vqmovn) both saturate.

/** Saturating fixed-point multiply of two QS8x8 vectors with
 * round-to-nearest.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiply of two QS16x4 vectors with
 * round-to-nearest.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating fixed-point multiply of two QS8x16 vectors, low/high halves
 * processed separately.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiply of two QS16x8 vectors, low/high halves
 * processed separately.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
584
/** Widening fixed-point multiply: QS8x8 * QS8x8 -> QS16x8.
 * Rounding and saturation are handled by the rounding saturating shift
 * (vqrshlq_s16) instead of an explicit bias term.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}

/** Fixed-point multiply-accumulate: a + b * c on QS8x8 vectors, with
 * round-to-nearest on the product and a wrapping (non-saturating) final add.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on QS16x4 vectors
 * (non-saturating).
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on QS8x16 vectors
 * (non-saturating), low/high halves processed separately.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c on QS16x8 vectors
 * (non-saturating), low/high halves processed separately.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
667
/** Saturating fixed-point multiply-accumulate: a + b * c on QS8x8 vectors.
 * Shift, narrow and final add all saturate.
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS16x4 vectors. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS8x16 vectors,
 * low/high halves processed separately.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS16x8 vectors,
 * low/high halves processed separately.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
743
/** Widening fixed-point multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8).
 * The product is rounded and shifted but the accumulation stays in the wide
 * type (non-saturating).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4)
 * (non-saturating).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate:
 * a(QS16x8) + b(QS8x8) * c(QS8x8); shift and final add saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate:
 * a(QS32x4) + b(QS16x4) * c(QS16x4); shift and final add saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
811
// Float -> fixed-point conversions. Each scales by 2^fixed_point_position,
// adds a sign-dependent 0.5 bias so the truncating vcvtq rounds to nearest,
// then narrows with saturation.

/** Convert 8 floats (two float32x4 registers) to a QS8x8 fixed-point vector.
 *
 * @param[in] a                    Input values
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Rounded, saturated QS8 vector
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias: makes the subsequent truncating conversion
    // round to nearest for both positive and negative inputs.
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert 4 floats to a QS16x4 fixed-point vector (round to nearest,
 * saturated narrow).
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    return vqmovn_s32(res_s32);
}

/** Convert 16 floats (four float32x4 registers) to a QS8x16 fixed-point
 * vector (round to nearest, saturated narrow).
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}

/** Convert 8 floats (two float32x4 registers) to a QS16x8 fixed-point
 * vector (round to nearest, saturated narrow).
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
918
/** Convert 8 QS8 fixed point values to two float32x4 vectors.
 *
 * @param[in] a                    8 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Two float vectors (8 values total)
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16, then split into two s32 vectors
    const int16x8_t wide16 = vmovl_s8(a);
    const int32x4_t lo32   = vmovl_s16(vget_low_qs16(wide16));
    const int32x4_t hi32   = vmovl_s16(vget_high_qs16(wide16));

    // Convert to float and undo the fixed point scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo32), inv_scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi32), inv_scale);
    return out;
}
946
/** Convert 4 QS16 fixed point values to a float32x4 vector.
 *
 * @param[in] a                    4 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return The converted float vector
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Widen to s32, convert to float, then divide by 2^fixed_point_position
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), inv_scale);
}
954
/** Convert 16 QS8 fixed point values to four float32x4 vectors.
 *
 * @param[in] a                    16 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Four float vectors (16 values total)
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 halves, then each s16 half into two s32 vectors
    const int16x8_t lo16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi16 = vmovl_s8(vget_high_s8(a));

    const int32x4_t wide32[4] =
    {
        vmovl_s16(vget_low_qs16(lo16)),
        vmovl_s16(vget_high_qs16(lo16)),
        vmovl_s16(vget_low_qs16(hi16)),
        vmovl_s16(vget_high_qs16(hi16)),
    };

    // Convert each lane group to float and undo the fixed point scaling
    float32x4x4_t out;
    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(vcvtq_f32_s32(wide32[i]), inv_scale);
    }
    return out;
}
994
/** Convert 8 QS16 fixed point values to two float32x4 vectors.
 *
 * @param[in] a                    8 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Two float vectors (8 values total)
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half to s32, convert to float and undo the fixed point scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), inv_scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), inv_scale);
    return out;
}
1020
/** Compute the reciprocal 1/a of 8 QS8 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial estimate 48/17 - 32/17 * x (subtraction; was erroneously an addition, cf. vrecipq_qs8)
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshl_s8(x, shift_value);
}
1044
/** Compute the reciprocal 1/a of 4 QS16 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1).
    // 16-bit lanes, so normalise against 16 (was 8, copied from the QS8 variant; cf. vrecipq_qs16)
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial estimate 48/17 - 32/17 * x (subtraction; was erroneously an addition, cf. vrecipq_qs16)
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshl_s16(x, shift_value);
}
1070
/** Compute the reciprocal 1/a of 16 QS8 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshlq_s8(x, shift_value);
}
1095
/** Compute the reciprocal 1/a of 8 QS16 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 = 2.8235... -> 2.8235 * 2^13 = 23130 = 0x5A5A (was mistyped 0x5A56; cf. vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshlq_s16(x, shift_value);
}
1122
/** Compute the reciprocal 1/a of 16 QS8 fixed point values with Newton-Raphson (saturating arithmetic).
 *
 * Same algorithm as vrecipq_qs8 but using the saturating variants of the
 * shift/add/sub/mul intrinsics where they are available.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vqshlq_s8(x, shift_value);
}
1147
/** Compute the reciprocal 1/a of 8 QS16 fixed point values with Newton-Raphson (saturating arithmetic).
 *
 * Same algorithm as vrecipq_qs16 but using the saturating variants of the
 * shift/add/sub/mul intrinsics.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 = 2.8235... -> 2.8235 * 2^13 = 23130 = 0x5A5A (was mistyped 0x5A56; cf. vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vqshlq_s16(x, shift_value);
}
1174
/** Division of two QS8 fixed point vectors: a / b computed as a * (1/b). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t recip_b = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, recip_b, fixed_point_position);
}
1179
/** Division of two QS16 fixed point vectors: a / b computed as a * (1/b). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t recip_b = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, recip_b, fixed_point_position);
}
1184
/** Division of two QS8 fixed point vectors (16 lanes): a / b computed as a * (1/b). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t recip_b = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, recip_b, fixed_point_position);
}
1189
/** Division of two QS16 fixed point vectors (8 lanes): a / b computed as a * (1/b). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t recip_b = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, recip_b, fixed_point_position);
}
1194
/** Evaluate the 4-coefficient polynomial used by the QS8 exp/log approximations (8 lanes).
 *
 * Coefficients come from log_tab_qs8 (islog) or exp_tab_qs8 (!islog); they are stored
 * in Q0.7 and rescaled here to the requested Q format, then evaluated with Horner's scheme.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit     = vdup_n_s8(1);
    const qint8x8_t c_a         = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x8_t c_b         = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c_c         = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c_d         = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x8_t horner = vadd_s8(vmul_qs8(a, c_d, fixed_point_position), c_c);
    horner           = vadd_s8(vmul_qs8(a, horner, fixed_point_position), c_b);
    horner           = vadd_s8(vmul_qs8(a, horner, fixed_point_position), c_a);
    return vmul_qs8(a, horner, fixed_point_position);
}
1210
/** Evaluate the 4-coefficient polynomial used by the QS16 exp/log approximations (4 lanes).
 *
 * Coefficients come from log_tab_qs16 (islog) or exp_tab_qs16 (!islog); they are stored
 * in Q0.15 and rescaled here to the requested Q format, then evaluated with Horner's scheme.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit     = vdup_n_s16(1);
    const qint16x4_t c_a         = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x4_t c_b         = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c_c         = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c_d         = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x4_t horner = vadd_s16(vmul_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vadd_s16(vmul_qs16(a, horner, fixed_point_position), c_b);
    horner            = vadd_s16(vmul_qs16(a, horner, fixed_point_position), c_a);
    return vmul_qs16(a, horner, fixed_point_position);
}
1226
/** Saturating variant of vtaylor_poly_qs8: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (8 QS8 lanes). */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit     = vdup_n_s8(1);
    const qint8x8_t c_a         = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x8_t c_b         = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c_c         = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c_d         = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x8_t horner = vqadd_s8(vqmul_qs8(a, c_d, fixed_point_position), c_c);
    horner           = vqadd_s8(vqmul_qs8(a, horner, fixed_point_position), c_b);
    horner           = vqadd_s8(vqmul_qs8(a, horner, fixed_point_position), c_a);
    return vqmul_qs8(a, horner, fixed_point_position);
}
1242
/** Saturating variant of vtaylor_poly_qs16: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (4 QS16 lanes). */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit     = vdup_n_s16(1);
    const qint16x4_t c_a         = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x4_t c_b         = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c_c         = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c_d         = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x4_t horner = vqadd_s16(vqmul_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vqadd_s16(vqmul_qs16(a, horner, fixed_point_position), c_b);
    horner            = vqadd_s16(vqmul_qs16(a, horner, fixed_point_position), c_a);
    return vqmul_qs16(a, horner, fixed_point_position);
}
1258
/** Evaluate the 4-coefficient polynomial used by the QS8 exp/log approximations (16 lanes).
 *
 * Coefficients come from log_tabq_qs8 (islog) or exp_tabq_qs8 (!islog), rescaled from
 * Q0.7 to the requested Q format and evaluated with Horner's scheme.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit     = vdupq_n_s8(1);
    const qint8x16_t c_a         = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x16_t c_b         = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c_c         = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c_d         = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x16_t horner = vaddq_s8(vmulq_qs8(a, c_d, fixed_point_position), c_c);
    horner            = vaddq_s8(vmulq_qs8(a, horner, fixed_point_position), c_b);
    horner            = vaddq_s8(vmulq_qs8(a, horner, fixed_point_position), c_a);
    return vmulq_qs8(a, horner, fixed_point_position);
}
1274
/** Evaluate the 4-coefficient polynomial used by the QS16 exp/log approximations (8 lanes).
 *
 * Coefficients come from log_tabq_qs16 (islog) or exp_tabq_qs16 (!islog), rescaled from
 * Q0.15 to the requested Q format and evaluated with Horner's scheme.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit     = vdupq_n_s16(1);
    const qint16x8_t c_a         = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x8_t c_b         = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c_c         = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c_d         = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x8_t horner = vaddq_s16(vmulq_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vaddq_s16(vmulq_qs16(a, horner, fixed_point_position), c_b);
    horner            = vaddq_s16(vmulq_qs16(a, horner, fixed_point_position), c_a);
    return vmulq_qs16(a, horner, fixed_point_position);
}
1290
/** Saturating variant of vtaylor_polyq_qs8: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (16 QS8 lanes). */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit     = vdupq_n_s8(1);
    const qint8x16_t c_a         = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x16_t c_b         = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c_c         = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c_d         = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x16_t horner = vqaddq_s8(vqmulq_qs8(a, c_d, fixed_point_position), c_c);
    horner            = vqaddq_s8(vqmulq_qs8(a, horner, fixed_point_position), c_b);
    horner            = vqaddq_s8(vqmulq_qs8(a, horner, fixed_point_position), c_a);
    return vqmulq_qs8(a, horner, fixed_point_position);
}
1306
/** Saturating variant of vtaylor_polyq_qs16: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (8 QS16 lanes). */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit     = vdupq_n_s16(1);
    const qint16x8_t c_a         = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x8_t c_b         = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c_c         = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c_d         = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x8_t horner = vqaddq_s16(vqmulq_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vqaddq_s16(vqmulq_qs16(a, horner, fixed_point_position), c_b);
    horner            = vqaddq_s16(vqmulq_qs16(a, horner, fixed_point_position), c_a);
    return vqmulq_qs16(a, horner, fixed_point_position);
}
1322
/** Compute exp(a) for 8 QS8 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    // Constants stored in Q0.7 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1348
/** Compute exp(a) for 4 QS16 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    // Constants stored in Q0.15 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1374
/** Compute exp(a) for 16 QS8 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    // Constants stored in Q0.7 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1400
/** Compute exp(a) for 8 QS16 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    // Constants stored in Q0.15 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1426
/** Compute the natural logarithm of 8 QS8 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip = vdup_n_s8(0);
    // Select a (not the zero vector) where a < 1 so vrecip_qs8 computes 1/a.
    // The operands were previously swapped, feeding zero into vrecip_qs8 (cf. vlogq_qs8).
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1465
/** Compute the natural logarithm of 4 QS16 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip = vdup_n_s16(0);
    // Select a (not the zero vector) where a < 1 so vrecip_qs16 computes 1/a.
    // The operands were previously swapped, feeding zero into vrecip_qs16 (cf. vlogq_qs16).
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum = vmul_s16(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1504
/** Compute the natural logarithm of 16 QS8 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip = vdupq_n_s8(0);
    // Select a where a < 1 so the reciprocal below is 1/a (other lanes don't matter)
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1543
/** Compute the natural logarithm of 8 QS16 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 * Note: this variant mixes saturating (vq*) and non-saturating intrinsics;
 * presumably intentional, but worth confirming against the QS8 variant.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    // Select a where a < 1 so the reciprocal below is 1/a (other lanes don't matter)
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1582
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshl_s8(x, shift_value2);
}
1609
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp         = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
    temp                    = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshl_s16(x, shift_value2);
}
1638
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001639inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1640{
1641 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1642
1643 // Find shift value. Number must be in (0.5, 2) range.
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001644 qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001645
1646 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001647 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001648 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001649 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001650 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1651
1652 temp = vshl_s8(a, shift_value);
1653
1654 // Initial guess
1655 qint8x8_t x = temp;
1656
1657 // Calculate (x / 2) * (3 - a * x^2)
1658 // After three iterations we have the result for 8 bit
1659 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1660 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1661 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1662
1663 return vshl_s8(x, shift_value2);
1664}
1665
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x4_t shift_value = vneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp         = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
    temp                    = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshl_s16(x, shift_value2);
}
1694
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshlq_s8(x, shift_value2);
}
1721
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp         = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                    = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshlq_s16(x, shift_value2);
}
1750
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp         = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                    = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    // NOTE(review): this intermediate shift is non-saturating, unlike the qs16
    // variant which uses vqshlq_s16 here — confirm the asymmetry is intended.
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshlq_s8(x, shift_value2);
}
1777
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x8_t shift_value = vnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp         = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                    = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range (saturating)
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshlq_s16(x, shift_value2);
}
1806
1807inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
1808{
1809 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1810 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1811
1812 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1813 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1814 qint8x8_t den = vqadd_qs8(exp2x, const_one);
1815 qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
1816
1817 return tanh;
1818}
1819
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001820inline qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position)
1821{
1822 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1823 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1824
1825 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1826 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1827 qint16x4_t den = vqadd_qs16(exp2x, const_one);
1828 qint16x4_t tanh = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position);
1829
1830 return tanh;
1831}
1832
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001833inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
1834{
1835 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1836 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1837
1838 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1839 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1840 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1841 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1842
1843 return tanh;
1844}
1845
1846inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1847{
1848 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1849}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001850
1851inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1852{
1853 float32x4x2_t res =
1854 {
1855 {
1856 vmaxq_f32(a.val[0], b.val[0]),
1857 vmaxq_f32(a.val[1], b.val[1])
1858 }
1859 };
1860 return res;
1861}
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001862
1863inline qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position)
1864{
1865 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1866 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1867
1868 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1869 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1870 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1871 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1872
1873 return tanh;
1874}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001875}