blob: 86b789dc56bb4b2fc95a889f61d90177b952437b [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements
 * The four entries are the polynomial coefficients (fixed point encodings of the
 * float values in the trailing comments); each entry is broadcast to every lane.
 */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements
 * Same coefficients as exp_tab_qs8 at 16 bit precision, broadcast per lane.
 */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements
 * Quad-register (16 lane) variant of exp_tab_qs8.
 */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements
 * Quad-register (8 lane) variant of exp_tab_qs16.
 */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
79
/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 * The four entries are the polynomial coefficients (fixed point encodings of the
 * float values in the trailing comments); each entry is broadcast to every lane.
 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 * Same coefficients as log_tab_qs8 at 16 bit precision, broadcast per lane.
 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 * Quad-register (16 lane) variant of log_tab_qs8.
 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 * Quad-register (8 lane) variant of log_tab_qs16.
 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
131
/** Get the lower half (8 elements) of a 16-element QS8 vector */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (4 elements) of an 8-element QS16 vector */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (8 elements) of a 16-element QS8 vector */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (4 elements) of an 8-element QS16 vector */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
151
/** Load 8 QS8 elements from @p addr */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it to all 8 lanes */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it to all 4 lanes */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it to all 16 lanes */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it to all 8 lanes */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load 16 interleaved QS16 elements from @p addr, de-interleaving into two vectors */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
196
/** Store 8 QS8 elements to @p addr */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two QS16 vectors interleaved (2-element structures) to @p addr */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
221
/** Narrow 8 QS16 elements to QS8 with saturation */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Narrow 4 QS32 elements to QS16 with saturation */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
231
/** Duplicate a QS8 scalar to all 8 lanes */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar to all 4 lanes */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar to all 16 lanes */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
246
/** Duplicate a float to all 16 lanes and convert to QS8 with saturation.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits used in the conversion
 *
 * @return 16 QS8 lanes all holding the converted value
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}
260
/** Duplicate a float to all 8 lanes and convert to QS16 with saturation.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits used in the conversion
 *
 * @return 8 QS16 lanes all holding the converted value
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    float32x4x2_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs16_f32(res, fixed_point_position);
}
272
/** Duplicate a QS16 scalar to all 8 lanes */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar to all 4 lanes */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
282
/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps) */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating) */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating) */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating) */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
322
/** Element-wise maximum of two QS8 vectors (8 elements) */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two QS16 vectors (4 elements) */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two QS8 vectors (16 elements) */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent QS8 elements across @p a and @p b */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent QS16 elements across @p a and @p b */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two QS16 vectors (8 elements) */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two QS8 vectors (8 elements) */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two QS16 vectors (4 elements) */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two QS8 vectors (16 elements) */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent QS8 elements across @p a and @p b */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent QS16 elements across @p a and @p b */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two QS16 vectors (8 elements) */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
382
/** Add two QS8 vectors, 8 elements (wrapping on overflow) */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two QS16 vectors, 4 elements (wrapping on overflow) */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two QS8 vectors, 16 elements (wrapping on overflow) */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two QS16 vectors, 8 elements (wrapping on overflow) */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two QS8 vectors, 8 elements */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two QS16 vectors, 4 elements */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two QS32 vectors, 2 elements */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating add of two QS8 vectors, 16 elements */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two QS16 vectors, 8 elements */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating add of two QS32 vectors, 4 elements */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add of adjacent QS8 elements, widening to 16 bit */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
437
/** Subtract two QS8 vectors, 8 elements (wrapping on overflow) */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two QS16 vectors, 4 elements (wrapping on overflow) */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two QS8 vectors, 16 elements (wrapping on overflow) */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two QS16 vectors, 8 elements (wrapping on overflow) */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two QS8 vectors, 8 elements */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two QS16 vectors, 4 elements */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two QS8 vectors, 16 elements */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two QS16 vectors, 8 elements */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
477
/** Fixed-point multiply of two QS8 vectors (8 elements), non-saturating.
 *
 * Widens to 16 bit, adds the rounding constant 2^(fixed_point_position - 1),
 * then shifts right by @p fixed_point_position and narrows back to 8 bit
 * (truncating on overflow — see vqmul_qs8 for the saturating variant).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiply of two QS16 vectors (4 elements), non-saturating.
 *
 * Widens to 32 bit, adds the rounding constant 2^(fixed_point_position - 1),
 * then shifts right by @p fixed_point_position and narrows back to 16 bit
 * (truncating on overflow — see vqmul_qs16 for the saturating variant).
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
511
/** Fixed-point multiply of two QS8 vectors (16 elements), non-saturating.
 *
 * Processes the low and high halves independently: widen to 16 bit, add the
 * rounding constant 2^(fixed_point_position - 1), shift right, narrow back
 * (truncating on overflow — see vqmulq_qs8 for the saturating variant).
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiply of two QS16 vectors (8 elements), non-saturating.
 *
 * Processes the low and high halves independently: widen to 32 bit, add the
 * rounding constant 2^(fixed_point_position - 1), shift right, narrow back
 * (truncating on overflow — see vqmulq_qs16 for the saturating variant).
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
551
/** Saturating fixed-point multiply of two QS8 vectors (8 elements).
 *
 * Same rounding scheme as vmul_qs8, but the shift (vqshlq) and the final
 * narrowing (vqmovn) both saturate instead of truncating.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiply of two QS16 vectors (4 elements).
 *
 * Same rounding scheme as vmul_qs16, but the shift (vqshlq) and the final
 * narrowing (vqmovn) both saturate instead of truncating.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
585
/** Saturating fixed-point multiply of two QS8 vectors (16 elements).
 *
 * Quad-register variant of vqmul_qs8: low and high halves are widened,
 * rounded, shifted (saturating) and narrowed (saturating) independently.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiply of two QS16 vectors (8 elements).
 *
 * Quad-register variant of vqmul_qs16: low and high halves are widened,
 * rounded, shifted (saturating) and narrowed (saturating) independently.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
625
/** Long fixed-point multiply of two QS8 vectors, keeping the QS16 result.
 *
 * The widened product is scaled back with vqrshlq_s16 (saturating *rounding*
 * shift right), so no explicit rounding constant is needed here.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vqrshlq with a negative shift count performs the rounding right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
634
/** Long fixed-point multiply of two QS16 vectors, keeping the QS32 result.
 *
 * The 32 bit product is rounded by accumulating 2^(fixed_point_position - 1)
 * before the saturating right shift by @p fixed_point_position — matching the
 * rounding used by vmul_qs16 and the rounding shift used by vmull_qs8.
 *
 * @param[in] a                    First operand (4 QS16 elements)
 * @param[in] b                    Second operand (4 QS16 elements)
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return The rounded products as 4 QS32 elements
 */
inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long. Accumulating into the rounding constant
    // (rather than overwriting it with a plain vmull_s16) is the whole point:
    // otherwise the constant is a dead store and the result is truncated
    // instead of rounded.
    tmp = vmlal_s16(tmp, a, b);

    // Shift right by fixed_point_position
    return vqshlq_s32(tmp, fixed_point_position_s32);
}
648
/** Fixed-point multiply-accumulate: a + (b * c), QS8, 8 elements, non-saturating.
 *
 * The product b * c is widened, rounded (constant 2^(fixed_point_position - 1)),
 * shifted right, narrowed, then added to @p a with wrapping arithmetic.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + (b * c), QS16, 4 elements, non-saturating.
 *
 * Same scheme as vmla_qs8 at 16/32 bit width.
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
682
/** Fixed-point multiply-accumulate: a + (b * c), QS8, 16 elements, non-saturating.
 *
 * Quad-register variant of vmla_qs8; low and high halves processed separately.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + (b * c), QS16, 8 elements, non-saturating.
 *
 * Quad-register variant of vmla_qs16; low and high halves processed separately.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
722
/** Saturating fixed-point multiply-accumulate: a + (b * c), QS8, 8 elements.
 *
 * Like vmla_qs8 but every step — shift, narrowing and final add — saturates.
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + (b * c), QS16, 4 elements.
 *
 * Like vmla_qs16 but every step — shift, narrowing and final add — saturates.
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
756
/** Saturating fixed-point multiply-accumulate: a + (b * c), QS8, 16 elements.
 *
 * Quad-register variant of vqmla_qs8; low and high halves processed separately.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + (b * c), QS16, 8 elements.
 *
 * Quad-register variant of vqmla_qs16; low and high halves processed separately.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
798
/** Widening fixed-point multiply-accumulate: a + (b * c), QS8 inputs, QS16 accumulator.
 *
 * The rounded, shifted product stays at 16 bit and is added to the 16 bit
 * accumulator @p a with wrapping arithmetic (see vqmlal_qs8 for saturation).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a + (b * c), QS16 inputs, QS32 accumulator.
 *
 * Same scheme as vmlal_qs8 at 16/32 bit width (see vqmlal_qs16 for saturation).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
832
/** Saturating widening multiply-accumulate: a + (b * c), QS8 inputs, QS16 accumulator.
 *
 * Like vmlal_qs8 but the shift and the final accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a + (b * c), QS16 inputs, QS32 accumulator.
 *
 * Like vmlal_qs16 but the shift and the final accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
866
/** Convert two float32x4 vectors to 8 QS8 values with saturation.
 *
 * Scales each lane by 2^fixed_point_position and rounds half away from zero
 * (adds +0.5 for non-negative lanes, -0.5 for negative lanes) before the
 * truncating float-to-integer conversion; narrowing steps saturate.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncating float -> int conversion
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow to 16 bit, then to 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
894
/** Convert one float32x4 vector to 4 QS16 values with saturation.
 *
 * Scales by 2^fixed_point_position and rounds half away from zero
 * (+0.5 / -0.5 offset per sign) before the truncating conversion;
 * the final narrowing saturates.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = offset + a * 2^fixed_point_position
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncating float -> int conversion
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Saturating narrow to 16 bit
    return vqmovn_s32(res_s32);
}
907
/** Convert four float32x4 vectors to 16 QS8 values with saturation.
 *
 * Quad-register variant of vqcvt_qs8_f32: scales each lane by
 * 2^fixed_point_position, rounds half away from zero (+0.5 / -0.5 offset
 * per sign), truncates to integer and narrows with saturation.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncating float -> int conversion
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow to 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow to 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
947
// Convert 8 float values (two float32x4_t) to 16-bit fixed point with saturation.
// Rounds half away from zero via a sign-selected +/-0.5 bias before truncation.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative ones
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate towards zero to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 bits with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
973
// Convert 8 values of 8-bit fixed point to float (two float32x4_t).
// Widens 8 -> 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits (low and high halves)
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1001
// Convert 4 values of 16-bit fixed point to float.
// Widens 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Sign-extend to 32 bits and convert to float
    const float32x4_t widened = vcvtq_f32_s32(vmovl_s16(a));

    // Undo the fixed-point scaling
    return vmulq_f32(widened, vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
1009
// Convert 16 values of 8-bit fixed point to float (four float32x4_t).
// Widens 8 -> 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits (low and high halves)
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1049
// Convert 8 values of 16-bit fixed point to float (two float32x4_t).
// Widens 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits (low and high halves)
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1075
// Reciprocal (1/a) of 8 values of 8-bit fixed point, using Newton-Raphson iteration.
// The input is normalized to [0.5, 1), the reciprocal of the normalized value is
// iterated, and the result is shifted back by the same amount (1/(a*2^s) * 2^s = 1/a).
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshl_s8(x, shift_value);
}
1101
// Reciprocal (1/a) of 4 values of 16-bit fixed point, using Newton-Raphson iteration.
// The input is normalized to [0.5, 1), the reciprocal of the normalized value is
// iterated, and the result is shifted back by the same amount (1/(a*2^s) * 2^s = 1/a).
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1).
    // 16 is the element bit width (was 8, an s8 leftover; the 128-bit variant vrecipq_qs16 uses 16).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshl_s16(x, shift_value);
}
1128
// Saturating reciprocal (1/a) of 8 values of 8-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshl_s8(x, shift_value);
}
1154
// Saturating reciprocal (1/a) of 4 values of 16-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1).
    // 16 is the element bit width (was 8, an s8 leftover; the 128-bit variant vqrecipq_qs16 uses 16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshl_s16(x, shift_value);
}
1181
// Reciprocal (1/a) of 16 values of 8-bit fixed point, using Newton-Raphson iteration.
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshlq_s8(x, shift_value);
}
1208
// Reciprocal (1/a) of 8 values of 16-bit fixed point, using Newton-Raphson iteration.
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (matches vrecip_qs16; was 0x5A56)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshlq_s16(x, shift_value);
}
1236
// Saturating reciprocal (1/a) of 16 values of 8-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshlq_s8(x, shift_value);
}
1263
// Saturating reciprocal (1/a) of 8 values of 16-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic. A zero input lane produces the maximum
// representable value instead of a meaningless result.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (matches vrecip_qs16; was 0x5A56)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Saturate result in case of overflow (zero input -> max representable value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1292
// Division (a / b) of 8 values of 8-bit fixed point, computed as a * (1 / b).
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t recip_b = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, recip_b, fixed_point_position);
}
1297
// Division (a / b) of 4 values of 16-bit fixed point, computed as a * (1 / b).
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t recip_b = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, recip_b, fixed_point_position);
}
1302
// Division (a / b) of 16 values of 8-bit fixed point, computed as a * (1 / b).
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t recip_b = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, recip_b, fixed_point_position);
}
1307
// Division (a / b) of 8 values of 16-bit fixed point, computed as a * (1 / b).
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t recip_b = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, recip_b, fixed_point_position);
}
1312
// Degree-4 Taylor polynomial evaluation (Horner scheme) on 8 values of 8-bit fixed
// point: res = a*(A + a*(B + a*(C + a*D))). Coefficients come from exp_tab_qs8 or
// log_tab_qs8 (islog) and are rescaled from Q0.7 to the current fixed-point format;
// the first log coefficient needs one extra left shift (hence const_one added to shift).
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1328
// Degree-4 Taylor polynomial evaluation (Horner scheme) on 4 values of 16-bit fixed
// point: res = a*(A + a*(B + a*(C + a*D))). Coefficients come from exp_tab_qs16 or
// log_tab_qs16 (islog) and are rescaled from Q0.15 to the current fixed-point format;
// the first log coefficient needs one extra left shift (hence const_one added to shift).
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1344
// Saturating variant of vtaylor_poly_qs8: same Horner evaluation, but all adds,
// shifts and multiplies saturate instead of wrapping on overflow.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1360
// Saturating variant of vtaylor_poly_qs16: same Horner evaluation, but all adds,
// shifts and multiplies saturate instead of wrapping on overflow.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1376
// 128-bit (16-lane) variant of vtaylor_poly_qs8: degree-4 Horner evaluation with
// exp/log table coefficients rescaled to the current fixed-point format.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1392
// 128-bit (8-lane) variant of vtaylor_poly_qs16: degree-4 Horner evaluation with
// exp/log table coefficients rescaled to the current fixed-point format.
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1408
// Saturating 128-bit (16-lane) variant of vtaylor_poly_qs8: Horner evaluation with
// saturating adds, shifts and multiplies.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1424
// Saturating 128-bit (8-lane) variant of vtaylor_poly_qs16: Horner evaluation with
// saturating adds, shifts and multiplies.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1440
// Saturating exponential (e^a) of 8 values of 8-bit fixed point.
// Writes a = m*ln(2) + alpha, evaluates e^alpha with a Taylor polynomial on the
// reduced range, then reconstructs by shifting with the integer part m.
// Constants 0x58/0x38 are ln(2) and the fractional part of 1/ln(2) in Q0.7;
// assumes the fixed-point format supports the involved shifts -- TODO confirm range.
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1466
// Saturating exponential (e^a) of 4 values of 16-bit fixed point.
// Same range-reduction scheme as vqexp_qs8; constants 0x58B9/0x38AA are ln(2)
// and the fractional part of 1/ln(2) in Q0.15.
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1492
/** Saturating exponential of a QS8 fixed-point vector (16 elements).
 *
 * Quad-register variant of vqexp_qs8: reduce the input to an integer multiple
 * m of ln(2) plus a residual in [-ln(2), ln(2)], approximate exp of the
 * residual with a Taylor polynomial, then rebuild with a saturating shift.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of exp(a) in the same fixed-point format.
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t norm_shift = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t ln2        = vqrshlq_s8(vdupq_n_s8(0x58), norm_shift);                // ln(2) rescaled to the requested Q format
    const qint8x16_t inv_ln2    = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), norm_shift), one); // 1/ln(2): fractional part rescaled, integer bit OR-ed in

    // Range reduction: express a as m * ln(2) + residual, residual in [-ln(2), ln(2)]
    const qint8x16_t m = vqmulq_qs8(a, inv_ln2, fixed_point_position); // a / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x16_t m_int = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // Residual = |a - m_int * ln(2)|
    qint8x16_t residual = vqmulq_qs8(vqshlq_s8(m_int, vdupq_n_s8(fixed_point_position)), ln2, fixed_point_position);
    residual            = vqabsq_qs8(vqsubq_qs8(a, residual));

    // Taylor polynomial approximation of exp(residual); add the constant term (1) separately
    qint8x16_t result = vqtaylor_polyq_qs8<false>(residual, fixed_point_position);
    result            = vqaddq_s8(result, one);

    // Reconstruct: multiply by 2^m_int via a saturating shift
    result = vqshlq_s8(result, m_int);

    return result;
}
1518
/** Saturating exponential of a QS16 fixed-point vector (8 elements).
 *
 * Quad-register variant of vqexp_qs16: reduce the input to an integer
 * multiple m of ln(2) plus a residual in [-ln(2), ln(2)], approximate exp of
 * the residual with a Taylor polynomial, then rebuild with a saturating shift.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of exp(a) in the same fixed-point format.
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t norm_shift = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t ln2        = vqrshlq_s16(vdupq_n_s16(0x58B9), norm_shift);                // ln(2) rescaled to the requested Q format
    const qint16x8_t inv_ln2    = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), norm_shift), one); // 1/ln(2): fractional part rescaled, integer bit OR-ed in

    // Range reduction: express a as m * ln(2) + residual, residual in [-ln(2), ln(2)]
    const qint16x8_t m = vqmulq_qs16(a, inv_ln2, fixed_point_position); // a / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x8_t m_int = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // Residual = |a - m_int * ln(2)|
    qint16x8_t residual = vqmulq_qs16(vqshlq_s16(m_int, vdupq_n_s16(fixed_point_position)), ln2, fixed_point_position);
    residual            = vqabsq_qs16(vqsubq_qs16(a, residual));

    // Taylor polynomial approximation of exp(residual); add the constant term (1) separately
    qint16x8_t result = vqtaylor_polyq_qs16<false>(residual, fixed_point_position);
    result            = vqaddq_s16(result, one);

    // Reconstruct: multiply by 2^m_int via a saturating shift
    result = vqshlq_s16(result, m_int);

    return result;
}
1544
/** Natural logarithm of a QS8 fixed-point vector (8 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized to (1, 2], a Taylor polynomial approximates log of the mantissa,
 * and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Fix: select the input value (not the zero vector) in the lanes that need
    // a reciprocal, matching vlogq_qs8. The operands were swapped, so
    // vrecip_qs8 was fed 0 for every lane with a < 1.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1583
/** Natural logarithm of a QS16 fixed-point vector (4 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized to (1, 2], a Taylor polynomial approximates log of the mantissa,
 * and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Fix: select the input value (not the zero vector) in the lanes that need
    // a reciprocal, matching vlogq_qs16. The operands were swapped, so
    // vrecip_qs16 was fed 0 for every lane with a < 1.
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1622
/** Natural logarithm of a QS8 fixed-point vector (16 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized into (1, 2], a Taylor polynomial approximates log of the
 * mantissa, and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x); the result is negated at the end
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep the input only in lanes that need a reciprocal (others stay 0)
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1661
/** Natural logarithm of a QS16 fixed-point vector (8 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized into (1, 2], a Taylor polynomial approximates log of the
 * mantissa, and the exponent contribution (n * ln(2)) is added back.
 * NOTE(review): this variant mixes saturating (vqsubq/vqaddq/vqmulq) and
 * wrapping ops, unlike vlogq_qs8 — presumably intentional for QS16 overflow
 * safety; confirm against the reference implementation.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x); the result is negated at the end
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep the input only in lanes that need a reciprocal (others stay 0)
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1700
/** Inverse square root (1/sqrt(a)) of a QS8 fixed-point vector (8 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; three Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t three = vdup_n_s8(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint8x8_t norm_shift = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint8x8_t raw_shift   = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t is_negative = vclt_s8(raw_shift, vdup_n_qs8(0));
    raw_shift             = vbsl_s8(is_negative, vadd_s8(raw_shift, vdup_n_s8(1)), raw_shift);
    qint8x8_t denorm_shift = vneg_s8(vshr_n_s8(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint8x8_t norm_a   = vshl_s8(a, norm_shift);
    qint8x8_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Three iterations are enough for 8-bit precision.
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vshl_s8(estimate, denorm_shift);
}
1727
/** Inverse square root (1/sqrt(a)) of a QS16 fixed-point vector (4 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1756
/** Saturating inverse square root (1/sqrt(a)) of a QS8 fixed-point vector
 *  (8 elements).
 *
 * Same algorithm as vinvsqrt_qs8 but using saturating intrinsics for the
 * shift/offset arithmetic and the Newton-Raphson updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1783
/** Saturating inverse square root (1/sqrt(a)) of a QS16 fixed-point vector
 *  (4 elements).
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back. Saturating
 * intrinsics guard the shift/offset arithmetic and the updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t three = vdup_n_s16(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint16x4_t norm_shift = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint16x4_t raw_shift   = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t is_negative = vclt_s16(raw_shift, vdup_n_qs16(0));
    raw_shift              = vbsl_s16(is_negative, vqadd_s16(raw_shift, vdup_n_s16(1)), raw_shift);
    qint16x4_t denorm_shift = vqneg_s16(vshr_n_s16(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint16x4_t norm_a   = vqshl_s16(a, norm_shift);
    qint16x4_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Five iterations are enough for 16-bit precision.
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vqshl_s16(estimate, denorm_shift);
}
1812
/** Inverse square root (1/sqrt(a)) of a QS8 fixed-point vector (16 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; three Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t three = vdupq_n_s8(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint8x16_t norm_shift = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint8x16_t raw_shift   = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t is_negative = vcltq_s8(raw_shift, vdupq_n_qs8(0));
    raw_shift              = vbslq_s8(is_negative, vaddq_s8(raw_shift, vdupq_n_s8(1)), raw_shift);
    qint8x16_t denorm_shift = vnegq_s8(vshrq_n_s8(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint8x16_t norm_a   = vshlq_s8(a, norm_shift);
    qint8x16_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Three iterations are enough for 8-bit precision.
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vshlq_s8(estimate, denorm_shift);
}
1839
/** Inverse square root (1/sqrt(a)) of a QS16 fixed-point vector (8 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1868
/** Saturating inverse square root (1/sqrt(a)) of a QS8 fixed-point vector
 *  (16 elements).
 *
 * Same algorithm as vinvsqrtq_qs8 but using saturating intrinsics for the
 * shift/offset arithmetic and the Newton-Raphson updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1895
/** Saturating inverse square root (1/sqrt(a)) of a QS16 fixed-point vector
 *  (8 elements).
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back. Saturating
 * intrinsics guard the shift/offset arithmetic and the updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t three = vdupq_n_s16(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint16x8_t norm_shift = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint16x8_t raw_shift   = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t is_negative = vcltq_s16(raw_shift, vdupq_n_qs16(0));
    raw_shift              = vbslq_s16(is_negative, vqaddq_s16(raw_shift, vdupq_n_s16(1)), raw_shift);
    qint16x8_t denorm_shift = vqnegq_s16(vshrq_n_s16(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint16x8_t norm_a   = vqshlq_s16(a, norm_shift);
    qint16x8_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Five iterations are enough for 16-bit precision.
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vqshlq_s16(estimate, denorm_shift);
}
1924
/** Saturating hyperbolic tangent of a QS8 fixed-point vector (8 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t two = vdup_n_s8(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint8x8_t e2a         = vqexp_qs8(vqmul_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x8_t numerator   = vqsub_qs8(e2a, one);
    const qint8x8_t denominator = vqadd_qs8(e2a, one);

    return vqmul_qs8(numerator, vqrecip_qs8(denominator, fixed_point_position), fixed_point_position);
}
1937
/** Saturating hyperbolic tangent of a QS16 fixed-point vector (4 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t two = vdup_n_s16(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint16x4_t e2a         = vqexp_qs16(vqmul_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x4_t numerator   = vqsub_qs16(e2a, one);
    const qint16x4_t denominator = vqadd_qs16(e2a, one);

    return vqmul_qs16(numerator, vqrecip_qs16(denominator, fixed_point_position), fixed_point_position);
}
1950
/** Saturating hyperbolic tangent of a QS8 fixed-point vector (16 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint8x16_t e2a         = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t numerator   = vqsubq_qs8(e2a, one);
    const qint8x16_t denominator = vqaddq_qs8(e2a, one);

    return vqmulq_qs8(numerator, vqrecipq_qs8(denominator, fixed_point_position), fixed_point_position);
}
1963
/** Saturating hyperbolic tangent of a QS16 fixed-point vector (8 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t two = vdupq_n_s16(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint16x8_t e2a         = vqexpq_qs16(vqmulq_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t numerator   = vqsubq_qs16(e2a, one);
    const qint16x8_t denominator = vqaddq_qs16(e2a, one);

    return vqmulq_qs16(numerator, vqrecipq_qs16(denominator, fixed_point_position), fixed_point_position);
}
1976
/** Saturating power function a^b for QS8 fixed-point vectors (16 elements).
 *
 * Evaluated via the identity a^b = exp(b * log(a)).
 *
 * @param[in] a                    Base vector in fixed-point format (expected > 0).
 * @param[in] b                    Exponent vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of a^b in the same fixed-point format.
 */
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a^b = exp(b * log(a))
    const qint8x16_t log_a = vlogq_qs8(a, fixed_point_position);
    return vqexpq_qs8(vqmulq_qs8(b, log_a, fixed_point_position), fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001981
/** Saturating power function a^b for QS16 fixed-point vectors (8 elements).
 *
 * Evaluated via the identity a^b = exp(b * log(a)).
 *
 * @param[in] a                    Base vector in fixed-point format (expected > 0).
 * @param[in] b                    Exponent vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of a^b in the same fixed-point format.
 */
inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a^b = exp(b * log(a))
    const qint16x8_t log_a = vlogq_qs16(a, fixed_point_position);
    return vqexpq_qs16(vqmulq_qs16(b, log_a, fixed_point_position), fixed_point_position);
}
1986
/** Element-wise maximum of two pairs of float32x4 vectors.
 *
 * @param[in] a First pair of vectors.
 * @param[in] b Second pair of vectors.
 *
 * @return A pair holding the lane-wise maximum of the corresponding halves.
 */
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001998}