blob: b86c3cbec3a7b630c98f3e52f0017f2815ade385 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Alex Gildayc357c472018-03-21 13:54:09 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Abe Mbise4bd2cb82017-09-27 18:39:19 +010024#include <array>
Georgios Pinitas00394ae2017-06-22 18:13:55 +010025#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
27namespace arm_compute
28{
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010029/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
30 * Format is in Q0.7 for all elements
31 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010032static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033{
34 {
35 vdup_n_s8(0x7F), // 0.9978546
36 vdup_n_s8(0x3F), // 0.4994721
37 vdup_n_s8(0x16), // 0.1763723
38 vdup_n_s8(0x05), // 0.0435108
39 }
40};
41
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010042/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
43 * Format is in Q0.15 for all elements
44 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010045static const std::array<qint16x4_t, 4> exp_tab_qs16 =
46{
47 {
48 vdup_n_s16(0x7FBA), // 0.9978546
49 vdup_n_s16(0x3FE9), // 0.4994721
50 vdup_n_s16(0x1693), // 0.1763723
51 vdup_n_s16(0x0592), // 0.0435108
52 }
53};
54
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010055/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
56 * Format is in Q0.7 for all elements
57 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010058static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010059{
60 {
61 vdupq_n_s8(0x7F), // 0.9978546
62 vdupq_n_s8(0x3F), // 0.4994721
63 vdupq_n_s8(0x16), // 0.1763723
64 vdupq_n_s8(0x05), // 0.0435108
65 }
66};
67
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010068/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
69 * Format is in Q0.15 for all elements
70 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010071static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
72{
73 {
74 vdupq_n_s16(0x7FBA), // 0.9978546
75 vdupq_n_s16(0x3FE9), // 0.4994721
76 vdupq_n_s16(0x1693), // 0.1763723
77 vdupq_n_s16(0x0592), // 0.0435108
78 }
79};
80
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010081/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
82 * Format is in Q0.7 for all elements except the first one which is in Q1.6
83 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010084static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010085{
86 {
87 vdup_n_s8(0x5C), // 1.4384189
88 vdup_n_s8(-0x56), // -0.6771900
89 vdup_n_s8(0x29), // 0.3218538
90 vdup_n_s8(-0x0A), // -0.0832229
91 }
92};
93
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010094/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
95 * Format is in Q0.15 for all elements except the first one which is in Q1.14
96 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010097static const std::array<qint16x4_t, 4> log_tab_qs16 =
98{
99 {
100 vdup_n_s16(0x5C0F), // 1.4384189
101 vdup_n_s16(-0x56AE), // -0.6771900
102 vdup_n_s16(0x2933), // 0.3218538
103 vdup_n_s16(-0x0AA7), // -0.0832229
104 }
105};
106
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100107/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
108 * Format is in Q0.7 for all elements except the first one which is in Q1.6
109 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100110static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100111{
112 {
113 vdupq_n_s8(0x5C), // 1.4384189
114 vdupq_n_s8(-0x56), // -0.6771900
115 vdupq_n_s8(0x29), // 0.3218538
116 vdupq_n_s8(-0x0A), // -0.0832229
117 }
118};
119
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100120/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
121 * Format is in Q0.15 for all elements except the first one which is in Q1.14
122 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100123static const std::array<qint16x8_t, 4> log_tabq_qs16 =
124{
125 {
126 vdupq_n_s16(0x5C0F), // 1.4384189
127 vdupq_n_s16(-0x56AE), // -0.6771900
128 vdupq_n_s16(0x2933), // 0.3218538
129 vdupq_n_s16(-0x0AA7), // -0.0832229
130 }
131};
132
Alex Gildayc357c472018-03-21 13:54:09 +0000133#ifndef DOXYGEN_SKIP_THIS
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134inline qint8x8_t vget_low_qs8(qint8x16_t a)
135{
136 return vget_low_s8(a);
137}
138
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100139inline qint16x4_t vget_low_qs16(qint16x8_t a)
140{
141 return vget_low_s16(a);
142}
143
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144inline qint8x8_t vget_high_qs8(qint8x16_t a)
145{
146 return vget_high_s8(a);
147}
148
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100149inline qint16x4_t vget_high_qs16(qint16x8_t a)
150{
151 return vget_high_s16(a);
152}
153
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100154inline qint8x8_t vld1_qs8(const qint8_t *addr)
155{
156 return vld1_s8(addr);
157}
158
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159inline qint16x4_t vld1_qs16(const qint16_t *addr)
160{
161 return vld1_s16(addr);
162}
163
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100164inline qint8x16_t vld1q_qs8(const qint8_t *addr)
165{
166 return vld1q_s8(addr);
167}
168
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100169inline qint16x8_t vld1q_qs16(const qint16_t *addr)
170{
171 return vld1q_s16(addr);
172}
173
174inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
175{
176 return vld1_dup_s8(addr);
177}
178
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100179inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
180{
181 return vld1_dup_s16(addr);
182}
183
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100184inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
185{
186 return vld1q_dup_s8(addr);
187}
188
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100189inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
190{
191 return vld1q_dup_s16(addr);
192}
193
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100194inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
195{
196 return vld2q_s16(addr);
197}
198
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100199inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
200{
201 vst1_s8(addr, b);
202}
203
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100204inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
205{
206 vst1_s16(addr, b);
207}
208
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100209inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
210{
211 vst1q_s8(addr, b);
212}
213
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
215{
216 vst1q_s16(addr, b);
217}
218
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100219inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
220{
221 vst2q_s16(addr, b);
222}
223
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224inline qint8x8_t vqmovn_qs16(qint16x8_t a)
225{
226 return vqmovn_s16(a);
227}
228
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100229inline qint16x4_t vqmovn_qs32(qint32x4_t a)
230{
231 return vqmovn_s32(a);
232}
233
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234inline qint8x8_t vdup_n_qs8(qint8_t a)
235{
236 return vdup_n_s8(a);
237}
238
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100239inline qint16x4_t vdup_n_qs16(qint16_t a)
240{
241 return vdup_n_s16(a);
242}
243
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100244inline qint8x16_t vdupq_n_qs8(qint8_t a)
245{
246 return vdupq_n_s8(a);
247}
248
249inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
250{
251 float32x4x4_t res =
252 {
253 {
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 vdupq_n_f32(a),
257 vdupq_n_f32(a),
258 }
259 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100260 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100261}
262
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100263inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
264{
265 float32x4x2_t res =
266 {
267 {
268 vdupq_n_f32(a),
269 vdupq_n_f32(a),
270 }
271 };
272 return vqcvtq_qs16_f32(res, fixed_point_position);
273}
274
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100275inline qint16x8_t vdupq_n_qs16(qint16_t a)
276{
277 return vdupq_n_s16(a);
278}
279
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100280inline qint32x4_t vdupq_n_qs32(qint32_t a)
281{
282 return vdupq_n_s32(a);
283}
284
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285inline qint8x8_t vabs_qs8(qint8x8_t a)
286{
287 return vabs_s8(a);
288}
289
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100290inline qint16x4_t vabs_qs16(qint16x4_t a)
291{
292 return vabs_s16(a);
293}
294
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100295inline qint8x16_t vabsq_qs8(qint8x16_t a)
296{
297 return vabsq_s8(a);
298}
299
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100300inline qint16x8_t vabsq_qs16(qint16x8_t a)
301{
302 return vabsq_s16(a);
303}
304
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100305inline qint8x8_t vqabs_qs8(qint8x8_t a)
306{
307 return vqabs_s8(a);
308}
309
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100310inline qint16x4_t vqabs_qs16(qint16x4_t a)
311{
312 return vqabs_s16(a);
313}
314
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100315inline qint8x16_t vqabsq_qs8(qint8x16_t a)
316{
317 return vqabsq_s8(a);
318}
319
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100320inline qint16x8_t vqabsq_qs16(qint16x8_t a)
321{
322 return vqabsq_s16(a);
323}
324
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100325inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
326{
327 return vmax_s8(a, b);
328}
329
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100330inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
331{
332 return vmax_s16(a, b);
333}
334
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100335inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
336{
337 return vmaxq_s8(a, b);
338}
339
340inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
341{
342 return vpmax_s8(a, b);
343}
344
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100345inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
346{
347 return vpmax_s16(a, b);
348}
349
350inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
351{
352 return vmaxq_s16(a, b);
353}
354
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100355inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
356{
357 return vmin_s8(a, b);
358}
359
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100360inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
361{
362 return vmin_s16(a, b);
363}
364
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100365inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
366{
367 return vminq_s8(a, b);
368}
369
370inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
371{
372 return vpmin_s8(a, b);
373}
374
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100375inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
376{
377 return vpmin_s16(a, b);
378}
379
380inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
381{
382 return vminq_s16(a, b);
383}
384
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100385inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
386{
387 return vadd_s8(a, b);
388}
389
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100390inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
391{
392 return vadd_s16(a, b);
393}
394
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100395inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
396{
397 return vaddq_s8(a, b);
398}
399
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100400inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
401{
402 return vaddq_s16(a, b);
403}
404
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100405inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
406{
407 return vqadd_s8(a, b);
408}
409
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100410inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
411{
412 return vqadd_s16(a, b);
413}
414
Georgios Pinitas9247c922017-06-28 18:29:47 +0100415inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
416{
417 return vqadd_s32(a, b);
418}
419
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100420inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
421{
422 return vqaddq_s8(a, b);
423}
424
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100425inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
426{
427 return vqaddq_s16(a, b);
428}
429
Georgios Pinitas9247c922017-06-28 18:29:47 +0100430inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
431{
432 return vqaddq_s32(a, b);
433}
434
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100435inline int16x4_t vpaddl_qs8(qint8x8_t a)
436{
437 return vpaddl_s8(a);
438}
439
440inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
441{
442 return vsub_s8(a, b);
443}
444
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100445inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
446{
447 return vsub_s16(a, b);
448}
449
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100450inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
451{
452 return vsubq_s8(a, b);
453}
454
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100455inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
456{
457 return vsubq_s16(a, b);
458}
459
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
461{
462 return vqsub_s8(a, b);
463}
464
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100465inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
466{
467 return vqsub_s16(a, b);
468}
469
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100470inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
471{
472 return vqsubq_s8(a, b);
473}
474
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100475inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
476{
477 return vqsubq_s16(a, b);
478}
479
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100480inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
481{
482 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
483
484 // Initialize the temporary result with a constant used to round up the result
485 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
486
487 // Vector multiply-accumulate long
488 res = vmlal_s8(res, a, b);
489
490 // Shift right by fixed_point_position
491 res = vshlq_s16(res, fixed_point_position_s16);
492
493 // Convert back to qint8
494 return vmovn_s16(res);
495}
496
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100497inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
498{
499 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
500
501 // Initialize the temporary result with a constant used to round up the result
502 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
503
504 // Vector multiply-accumulate long
505 res = vmlal_s16(res, a, b);
506
507 // Shift right by fixed_point_position
508 res = vshlq_s32(res, fixed_point_position_s32);
509
510 // Convert back to qint16
511 return vmovn_s32(res);
512}
513
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
515{
516 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
517
518 // Initialize the temporary results with a constant used to round up the result
519 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
520 qint16x8_t res1 = res0;
521
522 // Vector multiply-accumulate long
523 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
524 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
525
526 // Shift right by fixed_point_position
527 res0 = vshlq_s16(res0, fixed_point_position_s16);
528 res1 = vshlq_s16(res1, fixed_point_position_s16);
529
530 // Convert back to qint8
531 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
532}
533
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100534inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
535{
536 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
537
538 // Initialize the temporary results with a constant used to round up the result
539 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
540 qint32x4_t res1 = res0;
541
542 // Vector multiply-accumulate long
543 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
544 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
545
546 // Shift right by fixed_point_position
547 res0 = vshlq_s32(res0, fixed_point_position_s32);
548 res1 = vshlq_s32(res1, fixed_point_position_s32);
549
550 // Convert back to qint16
551 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
552}
553
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100554inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
555{
556 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
557
558 // Initialize the temporary result with a constant used to round up the result
559 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
560
561 // Vector multiply-accumulate long
562 res = vmlal_s8(res, a, b);
563
564 // Shift right by fixed_point_position
565 res = vqshlq_s16(res, fixed_point_position_s16);
566
567 // Convert back to qint8 and saturate
568 return vqmovn_s16(res);
569}
570
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100571inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
572{
573 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
574
575 // Initialize the temporary result with a constant used to round up the result
576 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
577
578 // Vector multiply-accumulate long
579 res = vmlal_s16(res, a, b);
580
581 // Shift right by fixed_point_position
582 res = vqshlq_s32(res, fixed_point_position_s32);
583
584 // Convert back to qint16 and saturate
585 return vqmovn_s32(res);
586}
587
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100588inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
589{
590 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
591
592 // Initialize the temporary results with a constant used to round up the result
593 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
594 qint16x8_t res1 = res0;
595
596 // Vector multiply-accumulate long
597 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
598 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
599
600 // Shift right by fixed_point_position
601 res0 = vqshlq_s16(res0, fixed_point_position_s16);
602 res1 = vqshlq_s16(res1, fixed_point_position_s16);
603
604 // Convert back to qint8 and saturate
605 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
606}
607
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100608inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
609{
610 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
611
612 // Initialize the temporary results with a constant used to round up the result
613 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
614 qint32x4_t res1 = res0;
615
616 // Vector multiply-accumulate long
617 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
618 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
619
620 // Shift right by fixed_point_position
621 res0 = vqshlq_s32(res0, fixed_point_position_s32);
622 res1 = vqshlq_s32(res1, fixed_point_position_s32);
623
624 // Convert back to qint16 and saturate
625 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
626}
627
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100628inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
629{
630 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
631
632 qint16x8_t res = vmull_s8(a, b);
633
634 return vqrshlq_s16(res, fixed_point_position_s16);
635}
636
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100637inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
638{
639 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
640
641 // Initialize the temporary results with a constant used to round up the result
642 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
643
644 // Vector multiply-accumulate long
645 tmp = vmull_s16(a, b);
646
647 // Shift right by fixed_point_position
648 return vqshlq_s32(tmp, fixed_point_position_s32);
649}
650
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100651inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
652{
653 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
654
655 // Initialize the temporary results with a constant used to round up the result
656 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
657
658 // Vector multiply-accumulate long
659 tmp = vmlal_s8(tmp, b, c);
660
661 // Shift right by fixed_point_position
662 tmp = vshlq_s16(tmp, fixed_point_position_s16);
663
664 // Convert back to qint8 and accumulate
665 return vadd_s8(a, vmovn_s16(tmp));
666}
667
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100668inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
669{
670 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
671
672 // Initialize the temporary results with a constant used to round up the result
673 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
674
675 // Vector multiply-accumulate long
676 tmp = vmlal_s16(tmp, b, c);
677
678 // Shift right by fixed_point_position
679 tmp = vshlq_s32(tmp, fixed_point_position_s32);
680
681 // Convert back to qint16 and accumulate
682 return vadd_s16(a, vmovn_s32(tmp));
683}
684
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100685inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
686{
687 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
688
689 // Initialize the temporary results with a constant used to round up the result
690 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
691 qint16x8_t tmp1 = tmp0;
692
693 // Vector multiply-accumulate long
694 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
695 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
696
697 // Shift right by fixed_point_position
698 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
699 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
700
701 // Convert back to qint8 and accumulate
702 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
703}
704
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100705inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
706{
707 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
708
709 // Initialize the temporary results with a constant used to round up the result
710 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
711 qint32x4_t tmp1 = tmp0;
712
713 // Vector multiply-accumulate long
714 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
715 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
716
717 // Shift right by fixed_point_position
718 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
719 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
720
721 // Convert back to qint16 and accumulate
722 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
723}
724
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100725inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
726{
727 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
728
729 // Initialize the temporary results with a constant used to round up the result
730 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
731
732 // Vector multiply-accumulate long
733 tmp = vmlal_s8(tmp, b, c);
734
735 // Shift right by fixed_point_position
736 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
737
738 // Convert back to qint8 and accumulate
739 return vqadd_s8(a, vqmovn_s16(tmp));
740}
741
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100742inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
743{
744 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
745
746 // Initialize the temporary results with a constant used to round up the result
747 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
748
749 // Vector multiply-accumulate long
750 tmp = vmlal_s16(tmp, b, c);
751
752 // Shift right by fixed_point_position
753 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
754
755 // Convert back to qint8 and accumulate
756 return vqadd_s16(a, vqmovn_s32(tmp));
757}
758
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100759inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
760{
761 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
762
763 // Initialize the temporary results with a constant used to round up the result
764 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
765 qint16x8_t tmp1 = tmp0;
766
767 // Vector multiply-accumulate long
768 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
769 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
770
771 // Shift right by fixed_point_position
772 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
773 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
774
775 // Convert back to qint8 and accumulate
776 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
777 return vqaddq_s8(a, res);
778}
779
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100780inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
781{
782 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
783
784 // Initialize the temporary results with a constant used to round up the result
785 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
786 qint32x4_t tmp1 = tmp0;
787
788 // Vector multiply-accumulate long
789 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
790 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
791
792 // Shift right by fixed_point_position
793 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
794 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
795
796 // Convert back to qint16 and accumulate
797 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
798 return vqaddq_s16(a, res);
799}
800
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100801inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
802{
803 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
804
805 // Initialize the temporary results with a constant used to round up the result
806 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
807
808 // Vector multiply-accumulate long
809 tmp = vmlal_s8(tmp, b, c);
810
811 // Shift right by fixed_point_position
812 tmp = vshlq_s16(tmp, fixed_point_position_s16);
813
814 // Accumulate
815 return vaddq_s16(a, tmp);
816}
817
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100818inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
819{
820 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
821
822 // Initialize the temporary results with a constant used to round up the result
823 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
824
825 // Vector multiply-accumulate long
826 tmp = vmlal_s16(tmp, b, c);
827
828 // Shift right by fixed_point_position
829 tmp = vshlq_s32(tmp, fixed_point_position_s32);
830
831 // Accumulate
832 return vaddq_s32(a, tmp);
833}
834
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100835inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
836{
837 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
838
839 // Initialize the temporary results with a constant used to round up the result
840 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
841
842 // Vector multiply-accumulate long
843 tmp = vmlal_s8(tmp, b, c);
844
845 // Shift right by fixed_point_position
846 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
847
848 // Accumulate
849 return vqaddq_s16(a, tmp);
850}
851
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100852inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
853{
854 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
855
856 // Initialize the temporary results with a constant used to round up the result
857 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
858
859 // Vector multiply-accumulate long
860 tmp = vmlal_s16(tmp, b, c);
861
862 // Shift right by fixed_point_position
863 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
864
865 // Accumulate
866 return vqaddq_s32(a, tmp);
867}
868
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100869inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100870{
871 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
872
873 float32x4x2_t res_f32 =
874 {
875 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100876 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
877 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100878 }
879 };
880
881 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
882 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
883
884 const int32x4x2_t res_s32 =
885 {
886 {
887 vcvtq_s32_f32(res_f32.val[0]),
888 vcvtq_s32_f32(res_f32.val[1]),
889 }
890 };
891
892 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
893
894 return vqmovn_s16(res_s16);
895}
896
/** Convert four 32-bit floats to four QS16 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Four QS16 values, saturated to the int16 range on overflow.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = a * 2^fpp + rounding bias
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncate to 32-bit integers
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Narrow 32 -> 16 with saturation
    return vqmovn_s32(res_s32);
}
909
/** Convert sixteen 32-bit floats (four float32x4 lanes) to sixteen QS8 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats, a.val[0] holds the lowest 4 results, a.val[3] the highest.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Sixteen QS8 values, saturated to the int8 range on overflow.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fpp + rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to 32-bit integers
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 with saturation
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
949
/** Convert eight 32-bit floats (two float32x4 lanes) to eight QS16 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats, a.val[0] holds the low 4 results, a.val[1] the high 4.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Eight QS16 values, saturated to the int16 range on overflow.
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fpp + rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
975
/** Convert eight QS8 fixed-point values to eight 32-bit floats.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Two float32x4 lanes: val[0] holds the low 4 elements, val[1] the high 4.
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float and rescale
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1003
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001004inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
1005{
1006 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1007 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
1008
1009 return vmulq_f32(res_f32, pow2);
1010}
1011
/** Convert sixteen QS8 fixed-point values to sixteen 32-bit floats.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Four float32x4 lanes: val[0] holds the lowest 4 elements, val[3] the highest.
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float and rescale
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1051
/** Convert eight QS16 fixed-point values to eight 32-bit floats.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Two float32x4 lanes: val[0] holds the low 4 elements, val[1] the high 4.
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float and rescale
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1077
/** Compute 1/a for eight QS8 fixed-point values using Newton-Raphson division (non-saturating).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format.
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1103
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001104inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1105{
1106 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1107 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1108 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1109 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001110 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001111
1112 // Find shift value
1113 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1114 const qint16x4_t temp = vshl_s16(a, shift_value);
1115
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001116 // Newton-Raphson division initial estimate X0 calculation
1117 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001118
1119 uint16x4_t set_one = vcgt_s16(x, const_one);
1120 x = vbsl_s16(set_one, const_one, x);
1121
Michalis Spyrou25466a92017-08-17 12:56:46 +01001122 // Use four iterations of Newton-Raphson method to get the result
1123 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1124 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1125 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1126 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001127
1128 return vshl_s16(x, shift_value);
1129}
1130
/** Compute 1/a for eight QS8 fixed-point values using Newton-Raphson division (saturating arithmetic).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format, using saturating intermediate ops.
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift (saturating)
    return vqshl_s8(x, shift_value);
}
1156
1157inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1158{
1159 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1160 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1161 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1162 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001163 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001164
1165 // Find shift value
1166 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1167 const qint16x4_t temp = vqshl_s16(a, shift_value);
1168
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001169 // Newton-Raphson division initial estimate X0 calculation
1170 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001171
1172 uint16x4_t set_one = vcgt_s16(x, const_one);
1173 x = vbsl_s16(set_one, const_one, x);
1174
Michalis Spyrou25466a92017-08-17 12:56:46 +01001175 // Use four iterations of Newton-Raphson method to get the result
1176 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1177 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1178 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1179 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001180
1181 return vqshl_s16(x, shift_value);
1182}
1183
/** Compute 1/a for sixteen QS8 fixed-point values using Newton-Raphson division (non-saturating).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format.
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1210
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001211inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1212{
1213 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1214 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1215 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1216 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001217 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001218
1219 // Find shift value
1220 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1221 const qint16x8_t temp = vshlq_s16(a, shift_value);
1222
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001223 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001224 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1225
1226 // Set initial guess to one if x > 1
1227 uint16x8_t set_one = vcgtq_s16(x, const_one);
1228 x = vbslq_s16(set_one, const_one, x);
1229
Michalis Spyrou25466a92017-08-17 12:56:46 +01001230 // Use four iterations of Newton-Raphson method to get the result
1231 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1232 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1233 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1234 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001235
1236 return vshlq_s16(x, shift_value);
1237}
1238
/** Compute 1/a for sixteen QS8 fixed-point values using Newton-Raphson division (saturating arithmetic).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format, using saturating intermediate ops.
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift (saturating)
    return vqshlq_s8(x, shift_value);
}
1265
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001266inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1267{
1268 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1269 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1270 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1271 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001272 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001273
1274 // Find shift value
1275 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1276 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1277
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001278 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001279 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1280
1281 // Set initial guess to one if x > 1
1282 uint16x8_t set_one = vcgtq_s16(x, const_one);
1283 x = vbslq_s16(set_one, const_one, x);
1284
Michalis Spyrou25466a92017-08-17 12:56:46 +01001285 // Use four iterations of Newton-Raphson method to get the result
1286 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1287 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1288 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1289 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001290
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001291 // Saturate result in case of overflow
1292 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001293}
1294
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001295inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1296{
1297 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1298}
1299
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001300inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1301{
1302 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1303}
1304
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001305inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1306{
1307 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1308}
1309
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001310inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1311{
1312 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1313}
1314
/** Evaluate a 4-coefficient polynomial on eight QS8 values using Horner's scheme (non-saturating).
 *
 * Coefficients come from log_tab_qs8 (islog) or exp_tab_qs8 (!islog) and are rescaled
 * from their table Q format to the requested one before use.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1  = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1330
/** Evaluate a 4-coefficient polynomial on four QS16 values using Horner's scheme (non-saturating).
 *
 * Coefficients come from log_tab_qs16 (islog) or exp_tab_qs16 (!islog) and are rescaled
 * from their table Q format to the requested one before use.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1  = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1346
/** Evaluate a 4-coefficient polynomial on eight QS8 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_poly_qs8.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1362
/** Evaluate a 4-coefficient polynomial on four QS16 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_poly_qs16.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1378
/** Evaluate a 4-coefficient polynomial on sixteen QS8 values using Horner's scheme (non-saturating).
 *
 * 128-bit counterpart of vtaylor_poly_qs8 using the log_tabq_qs8/exp_tabq_qs8 tables.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1  = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1394
/** Evaluate a 4-coefficient polynomial on eight QS16 values using Horner's scheme (non-saturating).
 *
 * 128-bit counterpart of vtaylor_poly_qs16 using the log_tabq_qs16/exp_tabq_qs16 tables.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1  = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1410
/** Evaluate a 4-coefficient polynomial on sixteen QS8 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_polyq_qs8.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1426
/** Evaluate a 4-coefficient polynomial on eight QS16 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_polyq_qs16.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1442
/** Compute exp(a) for eight QS8 fixed-point values (saturating arithmetic).
 *
 * Range-reduces the input against ln(2), approximates exp on the reduced argument
 * with vqtaylor_poly_qs8, then rescales by a power of two.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return exp(a) in the same Q format, saturated on overflow.
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift keeps only the integer part of m)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha), plus the constant term 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: scale by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1468
/** Compute exp(a) for four QS16 fixed-point values (saturating arithmetic).
 *
 * Range-reduces the input against ln(2), approximates exp on the reduced argument
 * with vqtaylor_poly_qs16, then rescales by a power of two.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return exp(a) in the same Q format, saturated on overflow.
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift keeps only the integer part of m)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha), plus the constant term 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: scale by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1494
/** Calculate saturating exponential of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Same algorithm as vqexp_qs8: reduce the argument to [-ln(2), ln(2)],
 * evaluate a polynomial, then shift by the integer multiple of ln(2).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8).
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise exp(a) in the same QS8 format, saturated.
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x58 / 0x38 are ln(2) and frac(1/ln(2)) in Q0.7; OR-ing const_one adds the integer bit of 1/ln(2).
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument for the polynomial
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1520
/** Calculate saturating exponential of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Same algorithm as vqexp_qs16: reduce the argument to [-ln(2), ln(2)],
 * evaluate a polynomial, then shift by the integer multiple of ln(2).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16).
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise exp(a) in the same QS16 format, saturated.
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x58B9 / 0x38AA are ln(2) and frac(1/ln(2)) in Q0.15; OR-ing const_one adds the integer bit of 1/ln(2).
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument for the polynomial
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1546
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001547inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1548{
1549 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1550 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1551 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1552
1553 // If 0 < a < 1, calculate log(1/x)
1554 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1555 qint8x8_t recip = vdup_n_s8(0);
1556 recip = vbsl_s8(calc_reciprocal, recip, a);
1557
1558 // Calculate reciprocal
1559 recip = vrecip_qs8(recip, fixed_point_position);
1560 a = vbsl_s8(calc_reciprocal, recip, a);
1561
1562 // Get decimal part of a
1563 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1564 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1565
1566 // Get exponent of 2^n which is equal or less than dec_a
1567 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1568
1569 // Get x to range (1, 2]
1570 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1571 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1572 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1573
1574 // Polynomial Approximation
1575 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1576
1577 // Reconstruct
1578 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1579
1580 // Set negative value for 0 < a < 1
1581 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1582
1583 return poly;
1584}
1585
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001586inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1587{
1588 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1589 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1590 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1591
1592 // If 0 < a < 1, calculate log(1/x)
1593 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1594 qint16x4_t recip = vdup_n_s16(0);
1595 recip = vbsl_s16(calc_reciprocal, recip, a);
1596
1597 // Calculate reciprocal
1598 recip = vrecip_qs16(recip, fixed_point_position);
1599 a = vbsl_s16(calc_reciprocal, recip, a);
1600
1601 // Get decimal part of a
1602 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1603 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1604
1605 // Get exponent of 2^n which is equal or less than dec_a
1606 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1607
1608 // Get x to range (1, 2]
1609 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1610 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1611 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1612
1613 // Polynomial Approximation
1614 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1615
1616 // Reconstruct
1617 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1618
1619 // Set negative value for 0 < a < 1
1620 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1621
1622 return poly;
1623}
1624
/** Calculate the natural logarithm of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used so the polynomial
 * is always evaluated on an argument >= 1. The argument is normalized to
 * (1, 2] and the result reconstructed as (P(x) + n) * ln(2).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise log(a) in the same QS8 format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Lanes with mask set take 'a' (reciprocal needed); other lanes stay 0
    recip                      = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1663
/** Calculate the natural logarithm of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used so the polynomial
 * is always evaluated on an argument >= 1. The argument is normalized to
 * (1, 2] and the result reconstructed as (P(x) + n) * ln(2).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise log(a) in the same QS16 format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Lanes with mask set take 'a' (reciprocal needed); other lanes stay 0
    recip                      = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating variant)
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1702
/** Calculate the inverse square root of a vector of 8-bit fixed point values.
 *
 * Normalizes each lane into (0.5, 2), then runs three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format (non-saturating arithmetic).
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1729
/** Calculate the inverse square root of a vector of 16-bit fixed point values.
 *
 * Normalizes each lane into (0.5, 2), then runs five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format (non-saturating arithmetic).
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1758
/** Calculate the saturating inverse square root of a vector of 8-bit fixed point values.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format, saturated.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
/** Calculate the saturating inverse square root of a vector of 16-bit fixed point values.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format, saturated.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1814
/** Calculate the inverse square root of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Normalizes each lane into (0.5, 2), then runs three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format (non-saturating arithmetic).
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1841
/** Calculate the inverse square root of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Normalizes each lane into (0.5, 2), then runs five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format (non-saturating arithmetic).
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1870
/** Calculate the saturating inverse square root of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs8 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format, saturated.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1897
/** Calculate the saturating inverse square root of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs16 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format, saturated.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1926
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001927inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001928{
1929 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1930 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1931
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001932 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1933 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1934 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1935 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001936
1937 return tanh;
1938}
1939
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001940inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001941{
1942 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1943 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1944
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001945 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1946 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1947 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1948 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001949
1950 return tanh;
1951}
1952
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001953inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001954{
1955 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1956 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1957
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001958 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1959 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1960 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1961 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001962
1963 return tanh;
1964}
1965
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001966inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1967{
1968 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1969 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1970
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001971 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1972 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1973 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1974 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001975
1976 return tanh;
1977}
1978
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001979inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1980{
1981 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1982}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001983
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001984inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1985{
1986 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1987}
1988
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001989inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1990{
1991 float32x4x2_t res =
1992 {
1993 {
1994 vmaxq_f32(a.val[0], b.val[0]),
1995 vmaxq_f32(a.val[1], b.val[1])
1996 }
1997 };
1998 return res;
1999}
Alex Gildayc357c472018-03-21 13:54:09 +00002000#endif /* DOXYGEN_SKIP_THIS */
Gian Marco Iodice356f6432017-09-22 11:32:21 +01002001} // namespace arm_compute