blob: a5d9e7685d591801553989e4edae98633257a716 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010028/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
29 * Format is in Q0.7 for all elements
30 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010031static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032{
33 {
34 vdup_n_s8(0x7F), // 0.9978546
35 vdup_n_s8(0x3F), // 0.4994721
36 vdup_n_s8(0x16), // 0.1763723
37 vdup_n_s8(0x05), // 0.0435108
38 }
39};
40
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010041/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
42 * Format is in Q0.15 for all elements
43 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010044static const std::array<qint16x4_t, 4> exp_tab_qs16 =
45{
46 {
47 vdup_n_s16(0x7FBA), // 0.9978546
48 vdup_n_s16(0x3FE9), // 0.4994721
49 vdup_n_s16(0x1693), // 0.1763723
50 vdup_n_s16(0x0592), // 0.0435108
51 }
52};
53
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010054/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
55 * Format is in Q0.7 for all elements
56 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010057static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058{
59 {
60 vdupq_n_s8(0x7F), // 0.9978546
61 vdupq_n_s8(0x3F), // 0.4994721
62 vdupq_n_s8(0x16), // 0.1763723
63 vdupq_n_s8(0x05), // 0.0435108
64 }
65};
66
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010067/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
68 * Format is in Q0.15 for all elements
69 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010070static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
71{
72 {
73 vdupq_n_s16(0x7FBA), // 0.9978546
74 vdupq_n_s16(0x3FE9), // 0.4994721
75 vdupq_n_s16(0x1693), // 0.1763723
76 vdupq_n_s16(0x0592), // 0.0435108
77 }
78};
79
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010080/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
81 * Format is in Q0.7 for all elements except the first one which is in Q1.6
82 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010083static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010084{
85 {
86 vdup_n_s8(0x5C), // 1.4384189
87 vdup_n_s8(-0x56), // -0.6771900
88 vdup_n_s8(0x29), // 0.3218538
89 vdup_n_s8(-0x0A), // -0.0832229
90 }
91};
92
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010093/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
94 * Format is in Q0.15 for all elements except the first one which is in Q1.14
95 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010096static const std::array<qint16x4_t, 4> log_tab_qs16 =
97{
98 {
99 vdup_n_s16(0x5C0F), // 1.4384189
100 vdup_n_s16(-0x56AE), // -0.6771900
101 vdup_n_s16(0x2933), // 0.3218538
102 vdup_n_s16(-0x0AA7), // -0.0832229
103 }
104};
105
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100106/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
107 * Format is in Q0.7 for all elements except the first one which is in Q1.6
108 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100109static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100110{
111 {
112 vdupq_n_s8(0x5C), // 1.4384189
113 vdupq_n_s8(-0x56), // -0.6771900
114 vdupq_n_s8(0x29), // 0.3218538
115 vdupq_n_s8(-0x0A), // -0.0832229
116 }
117};
118
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100119/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
120 * Format is in Q0.15 for all elements except the first one which is in Q1.14
121 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100122static const std::array<qint16x8_t, 4> log_tabq_qs16 =
123{
124 {
125 vdupq_n_s16(0x5C0F), // 1.4384189
126 vdupq_n_s16(-0x56AE), // -0.6771900
127 vdupq_n_s16(0x2933), // 0.3218538
128 vdupq_n_s16(-0x0AA7), // -0.0832229
129 }
130};
131
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100132inline qint8x8_t vget_low_qs8(qint8x16_t a)
133{
134 return vget_low_s8(a);
135}
136
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100137inline qint16x4_t vget_low_qs16(qint16x8_t a)
138{
139 return vget_low_s16(a);
140}
141
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100142inline qint8x8_t vget_high_qs8(qint8x16_t a)
143{
144 return vget_high_s8(a);
145}
146
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100147inline qint16x4_t vget_high_qs16(qint16x8_t a)
148{
149 return vget_high_s16(a);
150}
151
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100152inline qint8x8_t vld1_qs8(const qint8_t *addr)
153{
154 return vld1_s8(addr);
155}
156
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157inline qint16x4_t vld1_qs16(const qint16_t *addr)
158{
159 return vld1_s16(addr);
160}
161
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100162inline qint8x16_t vld1q_qs8(const qint8_t *addr)
163{
164 return vld1q_s8(addr);
165}
166
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100167inline qint16x8_t vld1q_qs16(const qint16_t *addr)
168{
169 return vld1q_s16(addr);
170}
171
172inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
173{
174 return vld1_dup_s8(addr);
175}
176
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100177inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
178{
179 return vld1_dup_s16(addr);
180}
181
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100182inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
183{
184 return vld1q_dup_s8(addr);
185}
186
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100187inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
188{
189 return vld1q_dup_s16(addr);
190}
191
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100192inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
193{
194 return vld2q_s16(addr);
195}
196
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
198{
199 vst1_s8(addr, b);
200}
201
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100202inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
203{
204 vst1_s16(addr, b);
205}
206
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100207inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
208{
209 vst1q_s8(addr, b);
210}
211
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100212inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
213{
214 vst1q_s16(addr, b);
215}
216
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100217inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
218{
219 vst2q_s16(addr, b);
220}
221
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100222inline qint8x8_t vqmovn_qs16(qint16x8_t a)
223{
224 return vqmovn_s16(a);
225}
226
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100227inline qint16x4_t vqmovn_qs32(qint32x4_t a)
228{
229 return vqmovn_s32(a);
230}
231
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100232inline qint8x8_t vdup_n_qs8(qint8_t a)
233{
234 return vdup_n_s8(a);
235}
236
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100237inline qint16x4_t vdup_n_qs16(qint16_t a)
238{
239 return vdup_n_s16(a);
240}
241
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100242inline qint8x16_t vdupq_n_qs8(qint8_t a)
243{
244 return vdupq_n_s8(a);
245}
246
247inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
248{
249 float32x4x4_t res =
250 {
251 {
252 vdupq_n_f32(a),
253 vdupq_n_f32(a),
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 }
257 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100258 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100259}
260
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100261inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
262{
263 float32x4x2_t res =
264 {
265 {
266 vdupq_n_f32(a),
267 vdupq_n_f32(a),
268 }
269 };
270 return vqcvtq_qs16_f32(res, fixed_point_position);
271}
272
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100273inline qint16x8_t vdupq_n_qs16(qint16_t a)
274{
275 return vdupq_n_s16(a);
276}
277
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100278inline qint32x4_t vdupq_n_qs32(qint32_t a)
279{
280 return vdupq_n_s32(a);
281}
282
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283inline qint8x8_t vabs_qs8(qint8x8_t a)
284{
285 return vabs_s8(a);
286}
287
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100288inline qint16x4_t vabs_qs16(qint16x4_t a)
289{
290 return vabs_s16(a);
291}
292
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100293inline qint8x16_t vabsq_qs8(qint8x16_t a)
294{
295 return vabsq_s8(a);
296}
297
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100298inline qint16x8_t vabsq_qs16(qint16x8_t a)
299{
300 return vabsq_s16(a);
301}
302
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303inline qint8x8_t vqabs_qs8(qint8x8_t a)
304{
305 return vqabs_s8(a);
306}
307
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100308inline qint16x4_t vqabs_qs16(qint16x4_t a)
309{
310 return vqabs_s16(a);
311}
312
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100313inline qint8x16_t vqabsq_qs8(qint8x16_t a)
314{
315 return vqabsq_s8(a);
316}
317
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100318inline qint16x8_t vqabsq_qs16(qint16x8_t a)
319{
320 return vqabsq_s16(a);
321}
322
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100323inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
324{
325 return vmax_s8(a, b);
326}
327
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100328inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
329{
330 return vmax_s16(a, b);
331}
332
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100333inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
334{
335 return vmaxq_s8(a, b);
336}
337
338inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
339{
340 return vpmax_s8(a, b);
341}
342
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100343inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
344{
345 return vpmax_s16(a, b);
346}
347
348inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
349{
350 return vmaxq_s16(a, b);
351}
352
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100353inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
354{
355 return vmin_s8(a, b);
356}
357
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100358inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
359{
360 return vmin_s16(a, b);
361}
362
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
364{
365 return vminq_s8(a, b);
366}
367
368inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
369{
370 return vpmin_s8(a, b);
371}
372
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100373inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
374{
375 return vpmin_s16(a, b);
376}
377
378inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
379{
380 return vminq_s16(a, b);
381}
382
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100383inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
384{
385 return vadd_s8(a, b);
386}
387
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100388inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
389{
390 return vadd_s16(a, b);
391}
392
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100393inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
394{
395 return vaddq_s8(a, b);
396}
397
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100398inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
399{
400 return vaddq_s16(a, b);
401}
402
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
404{
405 return vqadd_s8(a, b);
406}
407
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100408inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
409{
410 return vqadd_s16(a, b);
411}
412
Georgios Pinitas9247c922017-06-28 18:29:47 +0100413inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
414{
415 return vqadd_s32(a, b);
416}
417
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100418inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
419{
420 return vqaddq_s8(a, b);
421}
422
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100423inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
424{
425 return vqaddq_s16(a, b);
426}
427
Georgios Pinitas9247c922017-06-28 18:29:47 +0100428inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
429{
430 return vqaddq_s32(a, b);
431}
432
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100433inline int16x4_t vpaddl_qs8(qint8x8_t a)
434{
435 return vpaddl_s8(a);
436}
437
438inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
439{
440 return vsub_s8(a, b);
441}
442
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100443inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
444{
445 return vsub_s16(a, b);
446}
447
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100448inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
449{
450 return vsubq_s8(a, b);
451}
452
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100453inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
454{
455 return vsubq_s16(a, b);
456}
457
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
459{
460 return vqsub_s8(a, b);
461}
462
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100463inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
464{
465 return vqsub_s16(a, b);
466}
467
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100468inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
469{
470 return vqsubq_s8(a, b);
471}
472
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100473inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
474{
475 return vqsubq_s16(a, b);
476}
477
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
479{
480 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
481
482 // Initialize the temporary result with a constant used to round up the result
483 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
484
485 // Vector multiply-accumulate long
486 res = vmlal_s8(res, a, b);
487
488 // Shift right by fixed_point_position
489 res = vshlq_s16(res, fixed_point_position_s16);
490
491 // Convert back to qint8
492 return vmovn_s16(res);
493}
494
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100495inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
496{
497 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
498
499 // Initialize the temporary result with a constant used to round up the result
500 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
501
502 // Vector multiply-accumulate long
503 res = vmlal_s16(res, a, b);
504
505 // Shift right by fixed_point_position
506 res = vshlq_s32(res, fixed_point_position_s32);
507
508 // Convert back to qint16
509 return vmovn_s32(res);
510}
511
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100512inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
513{
514 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
515
516 // Initialize the temporary results with a constant used to round up the result
517 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
518 qint16x8_t res1 = res0;
519
520 // Vector multiply-accumulate long
521 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
522 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
523
524 // Shift right by fixed_point_position
525 res0 = vshlq_s16(res0, fixed_point_position_s16);
526 res1 = vshlq_s16(res1, fixed_point_position_s16);
527
528 // Convert back to qint8
529 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
530}
531
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100532inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
533{
534 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
535
536 // Initialize the temporary results with a constant used to round up the result
537 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
538 qint32x4_t res1 = res0;
539
540 // Vector multiply-accumulate long
541 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
542 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
543
544 // Shift right by fixed_point_position
545 res0 = vshlq_s32(res0, fixed_point_position_s32);
546 res1 = vshlq_s32(res1, fixed_point_position_s32);
547
548 // Convert back to qint16
549 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
550}
551
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100552inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
553{
554 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
555
556 // Initialize the temporary result with a constant used to round up the result
557 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
558
559 // Vector multiply-accumulate long
560 res = vmlal_s8(res, a, b);
561
562 // Shift right by fixed_point_position
563 res = vqshlq_s16(res, fixed_point_position_s16);
564
565 // Convert back to qint8 and saturate
566 return vqmovn_s16(res);
567}
568
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100569inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
570{
571 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
572
573 // Initialize the temporary result with a constant used to round up the result
574 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
575
576 // Vector multiply-accumulate long
577 res = vmlal_s16(res, a, b);
578
579 // Shift right by fixed_point_position
580 res = vqshlq_s32(res, fixed_point_position_s32);
581
582 // Convert back to qint16 and saturate
583 return vqmovn_s32(res);
584}
585
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100586inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
587{
588 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
589
590 // Initialize the temporary results with a constant used to round up the result
591 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
592 qint16x8_t res1 = res0;
593
594 // Vector multiply-accumulate long
595 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
596 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
597
598 // Shift right by fixed_point_position
599 res0 = vqshlq_s16(res0, fixed_point_position_s16);
600 res1 = vqshlq_s16(res1, fixed_point_position_s16);
601
602 // Convert back to qint8 and saturate
603 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
604}
605
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100606inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
607{
608 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
609
610 // Initialize the temporary results with a constant used to round up the result
611 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
612 qint32x4_t res1 = res0;
613
614 // Vector multiply-accumulate long
615 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
616 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
617
618 // Shift right by fixed_point_position
619 res0 = vqshlq_s32(res0, fixed_point_position_s32);
620 res1 = vqshlq_s32(res1, fixed_point_position_s32);
621
622 // Convert back to qint16 and saturate
623 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
624}
625
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100626inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
627{
628 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
629
630 qint16x8_t res = vmull_s8(a, b);
631
632 return vqrshlq_s16(res, fixed_point_position_s16);
633}
634
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100635inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
636{
637 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
638
639 // Initialize the temporary results with a constant used to round up the result
640 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
641
642 // Vector multiply-accumulate long
643 tmp = vmull_s16(a, b);
644
645 // Shift right by fixed_point_position
646 return vqshlq_s32(tmp, fixed_point_position_s32);
647}
648
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100649inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
650{
651 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
652
653 // Initialize the temporary results with a constant used to round up the result
654 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
655
656 // Vector multiply-accumulate long
657 tmp = vmlal_s8(tmp, b, c);
658
659 // Shift right by fixed_point_position
660 tmp = vshlq_s16(tmp, fixed_point_position_s16);
661
662 // Convert back to qint8 and accumulate
663 return vadd_s8(a, vmovn_s16(tmp));
664}
665
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100666inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
667{
668 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
669
670 // Initialize the temporary results with a constant used to round up the result
671 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
672
673 // Vector multiply-accumulate long
674 tmp = vmlal_s16(tmp, b, c);
675
676 // Shift right by fixed_point_position
677 tmp = vshlq_s32(tmp, fixed_point_position_s32);
678
679 // Convert back to qint16 and accumulate
680 return vadd_s16(a, vmovn_s32(tmp));
681}
682
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100683inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
684{
685 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
686
687 // Initialize the temporary results with a constant used to round up the result
688 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
689 qint16x8_t tmp1 = tmp0;
690
691 // Vector multiply-accumulate long
692 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
693 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
694
695 // Shift right by fixed_point_position
696 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
697 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
698
699 // Convert back to qint8 and accumulate
700 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
701}
702
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100703inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
704{
705 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
706
707 // Initialize the temporary results with a constant used to round up the result
708 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
709 qint32x4_t tmp1 = tmp0;
710
711 // Vector multiply-accumulate long
712 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
713 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
714
715 // Shift right by fixed_point_position
716 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
717 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
718
719 // Convert back to qint16 and accumulate
720 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
721}
722
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
724{
725 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
726
727 // Initialize the temporary results with a constant used to round up the result
728 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
729
730 // Vector multiply-accumulate long
731 tmp = vmlal_s8(tmp, b, c);
732
733 // Shift right by fixed_point_position
734 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
735
736 // Convert back to qint8 and accumulate
737 return vqadd_s8(a, vqmovn_s16(tmp));
738}
739
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100740inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
741{
742 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
743
744 // Initialize the temporary results with a constant used to round up the result
745 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
746
747 // Vector multiply-accumulate long
748 tmp = vmlal_s16(tmp, b, c);
749
750 // Shift right by fixed_point_position
751 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
752
753 // Convert back to qint8 and accumulate
754 return vqadd_s16(a, vqmovn_s32(tmp));
755}
756
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100757inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
758{
759 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
760
761 // Initialize the temporary results with a constant used to round up the result
762 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
763 qint16x8_t tmp1 = tmp0;
764
765 // Vector multiply-accumulate long
766 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
767 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
768
769 // Shift right by fixed_point_position
770 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
771 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
772
773 // Convert back to qint8 and accumulate
774 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
775 return vqaddq_s8(a, res);
776}
777
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100778inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
779{
780 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
781
782 // Initialize the temporary results with a constant used to round up the result
783 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
784 qint32x4_t tmp1 = tmp0;
785
786 // Vector multiply-accumulate long
787 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
788 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
789
790 // Shift right by fixed_point_position
791 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
792 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
793
794 // Convert back to qint16 and accumulate
795 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
796 return vqaddq_s16(a, res);
797}
798
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100799inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
800{
801 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
802
803 // Initialize the temporary results with a constant used to round up the result
804 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
805
806 // Vector multiply-accumulate long
807 tmp = vmlal_s8(tmp, b, c);
808
809 // Shift right by fixed_point_position
810 tmp = vshlq_s16(tmp, fixed_point_position_s16);
811
812 // Accumulate
813 return vaddq_s16(a, tmp);
814}
815
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100816inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
817{
818 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
819
820 // Initialize the temporary results with a constant used to round up the result
821 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
822
823 // Vector multiply-accumulate long
824 tmp = vmlal_s16(tmp, b, c);
825
826 // Shift right by fixed_point_position
827 tmp = vshlq_s32(tmp, fixed_point_position_s32);
828
829 // Accumulate
830 return vaddq_s32(a, tmp);
831}
832
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100833inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
834{
835 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
836
837 // Initialize the temporary results with a constant used to round up the result
838 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
839
840 // Vector multiply-accumulate long
841 tmp = vmlal_s8(tmp, b, c);
842
843 // Shift right by fixed_point_position
844 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
845
846 // Accumulate
847 return vqaddq_s16(a, tmp);
848}
849
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100850inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
851{
852 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
853
854 // Initialize the temporary results with a constant used to round up the result
855 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
856
857 // Vector multiply-accumulate long
858 tmp = vmlal_s16(tmp, b, c);
859
860 // Shift right by fixed_point_position
861 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
862
863 // Accumulate
864 return vqaddq_s32(a, tmp);
865}
866
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100867inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100868{
869 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
870
871 float32x4x2_t res_f32 =
872 {
873 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100874 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
875 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100876 }
877 };
878
879 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
880 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
881
882 const int32x4x2_t res_s32 =
883 {
884 {
885 vcvtq_s32_f32(res_f32.val[0]),
886 vcvtq_s32_f32(res_f32.val[1]),
887 }
888 };
889
890 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
891
892 return vqmovn_s16(res_s16);
893}
894
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100895inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100896{
897 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
898
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100899 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100900
901 res_f32 = vmlaq_f32(res_f32, a, pow2);
902
903 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
904
905 return vqmovn_s32(res_s32);
906}
907
/** Convert a float32x4x4_t (16 floats) to a 16-lane 8 bit fixed point vector, with saturation.
 *
 * Rounds half away from zero: a +/-0.5 bias is selected per lane from the input's sign
 * before the truncating float-to-int conversion.
 *
 * @param[in] a                    Float vector quadruple to convert
 * @param[in] fixed_point_position Number of fractional bits of the target Q format
 *
 * @return Converted 8 bit fixed point values (saturated on narrowing)
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative lanes
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate towards zero to signed 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow to 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow to 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
947
/** Convert a float32x4x2_t (8 floats) to an 8-lane 16 bit fixed point vector, with saturation.
 *
 * Rounds half away from zero: a +/-0.5 bias is selected per lane from the input's sign
 * before the truncating float-to-int conversion.
 *
 * @param[in] a                    Float vector pair to convert
 * @param[in] fixed_point_position Number of fractional bits of the target Q format
 *
 * @return Converted 16 bit fixed point values (saturated on narrowing)
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative lanes
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate towards zero to signed 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow to 16 bit
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
973
/** Convert an 8 bit fixed point vector to a pair of float32x4_t (8 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position. The conversion is exact (no saturation needed).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 bit -> 16 bit
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 bit -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1001
/** Convert a 16 bit fixed point vector to a float32x4_t.
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen to 32 bit, convert to float, then apply the fixed point scale
    const int32x4_t widened = vmovl_s16(a);
    return vmulq_f32(vcvtq_f32_s32(widened), scale);
}
1009
/** Convert a 16-lane 8 bit fixed point vector to a float32x4x4_t (16 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 bit -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 bit -> 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1049
/** Convert an 8-lane 16 bit fixed point vector to a float32x4x2_t (8 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 bit -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1075
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * The input is normalized into [0.5, 1), an initial estimate x0 = 48/17 - 32/17 * a
 * is computed, refined with three Newton-Raphson iterations x = x * (2 - a * x),
 * and the result is denormalized by the same shift.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshl_s8(x, shift_value);
}
1100
/** Calculate the reciprocal (1/a) of a 16 bit fixed point vector using Newton-Raphson.
 *
 * The input is normalized into [0.5, 1), an initial estimate x0 = 48/17 - 32/17 * a
 * is computed, refined with five Newton-Raphson iterations x = x * (2 - a * x),
 * and the result is denormalized by the same shift.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1).
    // BUGFIX: the element width here is 16 bits (vclz_s16 counts out of 16), not 8 —
    // using 8 produced a wrong normalization shift; this matches vrecipq_qs16.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshl_s16(x, shift_value);
}
1127
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating variant of vrecip_qs8: all shifts, additions and subtractions use
 * the saturating (vq*) intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshl_s8(x, shift_value);
}
1152
/** Calculate the reciprocal (1/a) of a 16 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating variant of vrecip_qs16: all shifts, additions and subtractions use
 * the saturating (vq*) intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1).
    // BUGFIX: the element width here is 16 bits (vclz_s16 counts out of 16), not 8 —
    // using 8 produced a wrong normalization shift; this matches vqrecipq_qs16.
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshl_s16(x, shift_value);
}
1179
/** Calculate the reciprocal (1/a) of a 16-lane 8 bit fixed point vector using Newton-Raphson.
 *
 * Quad-register variant of vrecip_qs8: normalize into [0.5, 1), estimate
 * x0 = 48/17 - 32/17 * a, refine with three Newton-Raphson iterations, denormalize.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshlq_s8(x, shift_value);
}
1205
/** Calculate the reciprocal (1/a) of an 8-lane 16 bit fixed point vector using Newton-Raphson.
 *
 * Quad-register variant of vrecip_qs16: normalize into [0.5, 1), estimate
 * x0 = 48/17 - 32/17 * a, refine with five Newton-Raphson iterations, denormalize.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used by the 64-bit variant; round(48/17 * 2^13) = 0x5A5A —
    // confirm which constant is intended (either converges, but they should agree).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (16 = element width in bits)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshlq_s16(x, shift_value);
}
1233
/** Calculate the reciprocal (1/a) of a 16-lane 8 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating quad-register variant of vrecip_qs8: all shifts, additions and
 * subtractions use the saturating (vq*) intrinsics.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshlq_s8(x, shift_value);
}
1259
/** Calculate the reciprocal (1/a) of an 8-lane 16 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating quad-register variant of vrecip_qs16. Additionally, lanes where the
 * input is zero are forced to the maximum representable value (saturated 1/0).
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used by the 64-bit variant; round(48/17 * 2^13) = 0x5A5A —
    // confirm which constant is intended (either converges, but they should agree).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (16 = element width in bits)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow: a == 0 lanes return INT16_MAX
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1288
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001289inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1290{
1291 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1292}
1293
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001294inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1295{
1296 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1297}
1298
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001299inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1300{
1301 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1302}
1303
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001304inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1305{
1306 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1307}
1308
/** Evaluate a degree-4 Taylor polynomial (Horner's method) on an 8 bit fixed point vector.
 *
 * Selects the log or exp coefficient table at compile time via @p islog and rescales
 * the Q0.7 table entries to the caller's Q format before evaluation.
 * Result is a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1324
/** Evaluate a degree-4 Taylor polynomial (Horner's method) on a 16 bit fixed point vector.
 *
 * Selects the log or exp coefficient table at compile time via @p islog and rescales
 * the Q0.15 table entries to the caller's Q format before evaluation.
 * Result is a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1340
/** Evaluate a degree-4 Taylor polynomial on an 8 bit fixed point vector, with saturation.
 *
 * Saturating variant of vtaylor_poly_qs8: uses vq* intrinsics so intermediate
 * overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1356
/** Evaluate a degree-4 Taylor polynomial on a 16 bit fixed point vector, with saturation.
 *
 * Saturating variant of vtaylor_poly_qs16: uses vq* intrinsics so intermediate
 * overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1372
/** Evaluate a degree-4 Taylor polynomial on a 16-lane 8 bit fixed point vector.
 *
 * Quad-register variant of vtaylor_poly_qs8: rescale the Q0.7 table entries to
 * the caller's Q format, then Horner-evaluate a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1388
/** Evaluate a degree-4 Taylor polynomial on an 8-lane 16 bit fixed point vector.
 *
 * Quad-register variant of vtaylor_poly_qs16: rescale the Q0.15 table entries to
 * the caller's Q format, then Horner-evaluate a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1404
/** Evaluate a degree-4 Taylor polynomial on a 16-lane 8 bit fixed point vector, with saturation.
 *
 * Saturating quad-register variant of vtaylor_poly_qs8: uses vq* intrinsics so
 * intermediate overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1420
/** Evaluate a degree-4 Taylor polynomial on an 8-lane 16 bit fixed point vector, with saturation.
 *
 * Saturating quad-register variant of vtaylor_poly_qs16: uses vq* intrinsics so
 * intermediate overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1436
/** Calculate the saturating exponential of an 8 bit fixed point vector.
 *
 * Computes exp(a) via range reduction: a = m*ln(2) + alpha with alpha in
 * [-ln(2), ln(2)], a Taylor polynomial approximation of exp(alpha), and
 * reconstruction by shifting with the integer part m.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 constants below to the caller's Q format
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1462
/** Calculate the saturating exponential of a 16 bit fixed point vector.
 *
 * Computes exp(a) via range reduction: a = m*ln(2) + alpha with alpha in
 * [-ln(2), ln(2)], a Taylor polynomial approximation of exp(alpha), and
 * reconstruction by shifting with the integer part m.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 constants below to the caller's Q format
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1488
/** Calculate the saturating exponential of a 16-lane 8 bit fixed point vector.
 *
 * Quad-register variant of vqexp_qs8: range reduction a = m*ln(2) + alpha,
 * Taylor polynomial approximation of exp(alpha), reconstruction by shifting.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 constants below to the caller's Q format
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1514
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001515inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
1516{
1517 const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
1518 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1519 const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
1520 const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
1521
1522 // Perform range reduction [-log(2),log(2)]
1523 const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
1524
1525 // get decimal part from m
1526 const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
1527
1528 qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
1529 alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));
1530
1531 // Polynomial Approximation
1532 qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
1533 poly = vqaddq_s16(poly, const_one);
1534
1535 // Reconstruct
1536 poly = vqshlq_s16(poly, dec_m);
1537
1538 return poly;
1539}
1540
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001541inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1542{
1543 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1544 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1545 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1546
1547 // If 0 < a < 1, calculate log(1/x)
1548 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1549 qint8x8_t recip = vdup_n_s8(0);
1550 recip = vbsl_s8(calc_reciprocal, recip, a);
1551
1552 // Calculate reciprocal
1553 recip = vrecip_qs8(recip, fixed_point_position);
1554 a = vbsl_s8(calc_reciprocal, recip, a);
1555
1556 // Get decimal part of a
1557 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1558 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1559
1560 // Get exponent of 2^n which is equal or less than dec_a
1561 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1562
1563 // Get x to range (1, 2]
1564 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1565 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1566 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1567
1568 // Polynomial Approximation
1569 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1570
1571 // Reconstruct
1572 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1573
1574 // Set negative value for 0 < a < 1
1575 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1576
1577 return poly;
1578}
1579
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001580inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1581{
1582 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1583 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1584 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1585
1586 // If 0 < a < 1, calculate log(1/x)
1587 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1588 qint16x4_t recip = vdup_n_s16(0);
1589 recip = vbsl_s16(calc_reciprocal, recip, a);
1590
1591 // Calculate reciprocal
1592 recip = vrecip_qs16(recip, fixed_point_position);
1593 a = vbsl_s16(calc_reciprocal, recip, a);
1594
1595 // Get decimal part of a
1596 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1597 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1598
1599 // Get exponent of 2^n which is equal or less than dec_a
1600 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1601
1602 // Get x to range (1, 2]
1603 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1604 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1605 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1606
1607 // Polynomial Approximation
1608 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1609
1610 // Reconstruct
1611 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1612
1613 // Set negative value for 0 < a < 1
1614 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1615
1616 return poly;
1617}
1618
/** Calculate the natural logarithm of a QS8 vector (16 lanes).
 *
 * Inputs in (0, 1) are handled as log(x) = -log(1/x); the value is then
 * normalised to (1, 2] and approximated with a Taylor polynomial.
 *
 * @param[in] a                    Input vector in fixed point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The calculated logarithm.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // keep only the lanes that need the reciprocal

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a); // substitute 1/a into the a < 1 lanes

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // exponent n in fixed point format

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(x) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1657
/** Calculate the natural logarithm of a QS16 vector (8 lanes).
 *
 * Inputs in (0, 1) are handled as log(x) = -log(1/x); the value is then
 * normalised to (1, 2] and approximated with a Taylor polynomial.
 *
 * @param[in] a                    Input vector in fixed point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The calculated logarithm.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    recip                      = vbslq_s16(calc_reciprocal, a, recip); // keep only the lanes that need the reciprocal

    // Calculate reciprocal
    // NOTE(review): uses the saturating vqrecipq_qs16 here, while vlogq_qs8 uses the
    // non-saturating vrecipq_qs8 — presumably deliberate; confirm against callers
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a); // substitute 1/a into the a < 1 lanes

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // exponent n in fixed point format

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(x) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1696
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001697inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1698{
1699 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1700
1701 // Find shift value. Number must be in (0.5, 2) range.
1702 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1703
1704 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1705 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1706 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
1707 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
1708 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1709
1710 temp = vshl_s8(a, shift_value);
1711
1712 // Initial guess
1713 qint8x8_t x = temp;
1714
1715 // Calculate (x / 2) * (3 - a * x^2)
1716 // After three iterations we have the result for 8 bit
1717 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1718 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1719 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1720
1721 return vshl_s8(x, shift_value2);
1722}
1723
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001724inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
1725{
1726 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1727
1728 // Find shift value. Number must be in (0.5, 2) range.
1729 qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1730
1731 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1732 qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1733 uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
1734 temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
1735 qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
1736
1737 temp = vshl_s16(a, shift_value);
1738
1739 // Initial guess
1740 qint16x4_t x = temp;
1741
1742 // Calculate (x / 2) * (3 - a * x^2)
1743 // After five iterations we have the result for 8 bit
1744 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1745 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1746 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1747 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1748 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1749
1750 return vshl_s16(x, shift_value2);
1751}
1752
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001753inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1754{
1755 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1756
1757 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001758 qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001759
1760 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001761 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001762 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001763 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001764 qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001765
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001766 temp = vqshl_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001767
1768 // Initial guess
1769 qint8x8_t x = temp;
1770
1771 // Calculate (x / 2) * (3 - a * x^2)
1772 // After three iterations we have the result for 8 bit
1773 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1774 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1775 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1776
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001777 return vqshl_s8(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001778}
1779
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001780inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
1781{
1782 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1783
1784 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001785 qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001786
1787 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1788 qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1789 uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
1790 temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001791 qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001792
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001793 temp = vqshl_s16(a, shift_value);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001794
1795 // Initial guess
1796 qint16x4_t x = temp;
1797
1798 // Calculate (x / 2) * (3 - a * x^2)
1799 // After five iterations we have the result for 16 bit
1800 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1801 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1802 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1803 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1804 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1805
1806 return vqshl_s16(x, shift_value2);
1807}
1808
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001809inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
1810{
1811 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1812
1813 // Find shift value. Number must be in (0.5, 2) range.
1814 qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1815
1816 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1817 qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
1818 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
1819 temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
1820 qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
1821
1822 temp = vshlq_s8(a, shift_value);
1823
1824 // Initial guess
1825 qint8x16_t x = temp;
1826
1827 // Calculate (x / 2) * (3 - a * x^2)
1828 // After three iterations we have the result for 8 bit
1829 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1830 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1831 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1832
1833 return vshlq_s8(x, shift_value2);
1834}
1835
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001836inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1837{
1838 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1839
1840 // Find shift value. Number must be in (0.5, 2) range.
1841 qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1842
1843 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1844 qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1845 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1846 temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
1847 qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
1848
1849 temp = vshlq_s16(a, shift_value);
1850
1851 // Initial guess
1852 qint16x8_t x = temp;
1853
1854 // Calculate (x / 2) * (3 - a * x^2)
1855 // After five iterations we have the result for 16 bit
1856 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1857 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1858 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1859 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1860 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1861
1862 return vshlq_s16(x, shift_value2);
1863}
1864
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001865inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
1866{
1867 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1868
1869 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001870 qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001871
1872 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001873 qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001874 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001875 temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001876 qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001877
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001878 temp = vqshlq_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001879
1880 // Initial guess
1881 qint8x16_t x = temp;
1882
1883 // Calculate (x / 2) * (3 - a * x^2)
1884 // After three iterations we have the result for 8 bit
1885 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1886 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1887 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1888
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001889 return vqshlq_s8(x, shift_value2);
1890}
1891
1892inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1893{
1894 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1895
1896 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001897 qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001898
1899 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1900 qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1901 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1902 temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001903 qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001904
1905 temp = vqshlq_s16(a, shift_value);
1906
1907 // Initial guess
1908 qint16x8_t x = temp;
1909
1910 // Calculate (x / 2) * (3 - a * x^2)
1911 // After five iterations we have the result for 16 bit
1912 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1913 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1914 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1915 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1916 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1917
1918 return vqshlq_s16(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001919}
1920
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001921inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001922{
1923 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1924 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1925
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001926 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1927 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1928 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1929 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001930
1931 return tanh;
1932}
1933
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001934inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001935{
1936 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1937 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1938
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001939 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1940 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1941 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1942 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001943
1944 return tanh;
1945}
1946
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001947inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001948{
1949 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1950 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1951
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001952 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1953 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1954 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1955 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001956
1957 return tanh;
1958}
1959
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001960inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1961{
1962 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1963 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1964
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001965 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1966 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1967 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1968 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001969
1970 return tanh;
1971}
1972
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001973inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1974{
1975 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1976}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001977
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001978inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1979{
1980 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1981}
1982
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001983inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1984{
1985 float32x4x2_t res =
1986 {
1987 {
1988 vmaxq_f32(a.val[0], b.val[0]),
1989 vmaxq_f32(a.val[1], b.val[1])
1990 }
1991 };
1992 return res;
1993}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001994}