blob: 966313d58b63eddbe2a5b42b74aca0147c72ccef [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Abe Mbise4bd2cb82017-09-27 18:39:19 +010024#include <array>
Georgios Pinitas00394ae2017-06-22 18:13:55 +010025#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
27namespace arm_compute
28{
// Polynomial approximation tables for exp()/log() in fixed point.
// Each table holds the 4 polynomial coefficients, one coefficient broadcast
// across every lane of the vector, so a whole vector can be evaluated with
// Horner's scheme using the multiply-accumulate wrappers below.

/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements
 */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements
 */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements
 */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements
 */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
132
// -------------------------------------------------------------------------
// Thin wrappers mapping the library's fixed point vector types (qint*) onto
// the raw NEON integer intrinsics. A fixed point value shares the bit
// pattern of a plain integer, so half-extraction, load, store, narrow and
// broadcast operations need no Q-format handling and simply forward.
// -------------------------------------------------------------------------

// Extract the lower/upper half (d-register) of a q-register vector.
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

// Loads from memory (vld2q de-interleaves two vectors).
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

// Load a single scalar and broadcast it to every lane.
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}

// Stores to memory (vst2q interleaves two vectors).
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}

// Saturating narrow to the next smaller element width.
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

// Broadcast a scalar to every lane.
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
247
248inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
249{
250 float32x4x4_t res =
251 {
252 {
253 vdupq_n_f32(a),
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 vdupq_n_f32(a),
257 }
258 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100259 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100260}
261
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100262inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
263{
264 float32x4x2_t res =
265 {
266 {
267 vdupq_n_f32(a),
268 vdupq_n_f32(a),
269 }
270 };
271 return vqcvtq_qs16_f32(res, fixed_point_position);
272}
273
// -------------------------------------------------------------------------
// Element-wise arithmetic wrappers. Addition, subtraction, absolute value,
// minimum and maximum are Q-format agnostic (the binary point does not move),
// so they forward directly to the integer intrinsics. The vq* variants
// saturate instead of wrapping on overflow; vp* variants operate pairwise.
// -------------------------------------------------------------------------

inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}

// Absolute value. Note: non-saturating vabs wraps for the most negative
// value (e.g. |INT8_MIN| stays INT8_MIN); use the vqabs variants to saturate.
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}

// Element-wise and pairwise maximum.
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

// Element-wise and pairwise minimum.
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}

// Addition (wrapping) and saturating addition.
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

// Pairwise add-long: sums adjacent pairs, widening to 16 bit lanes.
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

// Subtraction (wrapping) and saturating subtraction.
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
478
// -------------------------------------------------------------------------
// Fixed point multiplication.
//
// Common scheme: widen the operands via a long multiply-accumulate into a
// register pre-loaded with the rounding bias 1 << (fixed_point_position - 1),
// shift right by fixed_point_position to restore the Q format (implemented
// as a left shift by a negative amount), then narrow back to the original
// width. The vq* variants use saturating shifts and saturating narrows; the
// plain variants wrap on overflow.
// -------------------------------------------------------------------------

// Multiply two QS8 vectors (8 elements), rounding, wrapping on overflow.
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

// Multiply two QS16 vectors (4 elements), rounding, wrapping on overflow.
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

// Multiply two QS8 vectors (16 elements): processed as two 8-element halves
// because the widened products need two 16x8 accumulators.
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

// Multiply two QS16 vectors (8 elements): processed as two 4-element halves.
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}

// Saturating multiply of two QS8 vectors (8 elements).
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

// Saturating multiply of two QS16 vectors (4 elements).
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

// Saturating multiply of two QS8 vectors (16 elements), two halves at a time.
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

// Saturating multiply of two QS16 vectors (8 elements), two halves at a time.
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
626
// Widening multiply of two QS8 vectors: the 16 bit product keeps full
// precision, so only the Q-format correction shift is applied, using a
// rounding saturating shift (vqrshl).
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
635
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100636inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
637{
638 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
639
640 // Initialize the temporary results with a constant used to round up the result
641 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
642
643 // Vector multiply-accumulate long
644 tmp = vmull_s16(a, b);
645
646 // Shift right by fixed_point_position
647 return vqshlq_s32(tmp, fixed_point_position_s32);
648}
649
// -------------------------------------------------------------------------
// Fixed point multiply-accumulate: a + b * c. The product b * c is computed
// with the widen / round / shift / narrow scheme described above, then added
// to the accumulator a. These variants wrap on overflow; see vqmla* below
// for the saturating versions.
// -------------------------------------------------------------------------

// QS8 multiply-accumulate, 8 elements: a + b * c.
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

// QS16 multiply-accumulate, 4 elements: a + b * c.
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

// QS8 multiply-accumulate, 16 elements, two halves at a time.
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

// QS16 multiply-accumulate, 8 elements, two halves at a time.
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
723
// Saturating QS8 multiply-accumulate, 8 elements: a + b * c with saturation
// on both the Q-format shift and the final narrow/add.
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

// Saturating QS16 multiply-accumulate, 4 elements: a + b * c.
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
757
// Saturating QS8 multiply-accumulate, 16 elements, two halves at a time.
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

// Saturating QS16 multiply-accumulate, 8 elements, two halves at a time.
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
799
// -------------------------------------------------------------------------
// Widening multiply-accumulate: the accumulator a is already in the wider
// format (QS16 accumulating QS8 products, QS32 accumulating QS16 products),
// so the rounded, shifted product is added without narrowing.
// -------------------------------------------------------------------------

// QS16 accumulator += QS8 product b * c (wrapping add).
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

// QS32 accumulator += QS16 product b * c (wrapping add).
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

// QS16 accumulator += QS8 product b * c, saturating shift and add.
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

// QS32 accumulator += QS16 product b * c, saturating shift and add.
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
867
// Convert 8 floats (two float32x4 vectors) to a QS8 vector with saturation.
// Scales by 2^fixed_point_position and rounds to nearest by first seeding
// each lane with +/-0.5 according to the sign of the input, then truncating
// via vcvtq_s32_f32.
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow 32 -> 16 -> 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
895
// Convert 4 floats to a QS16 vector with saturation. Same scheme as
// vqcvt_qs8_f32: scale by 2^fixed_point_position, round to nearest via a
// sign-dependent +/-0.5 bias, truncate, then saturating narrow to 16 bit.
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = bias + a * 2^fixed_point_position
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Saturating narrow to 16 bit
    return vqmovn_s32(res_s32);
}
908
// Convert 16 floats (four float32x4 vectors) to a QS8 vector with
// saturation. Same round-to-nearest scheme as vqcvt_qs8_f32, applied to all
// four input vectors, then narrowed 32 -> 16 -> 8 bit with saturation.
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow 32 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow 16 -> 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
948
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100949inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100950{
951 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
952
953 float32x4x2_t res_f32 =
954 {
955 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100956 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
957 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100958 }
959 };
960
961 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
962 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
963
964 const int32x4x2_t res_s32 =
965 {
966 {
967 vcvtq_s32_f32(res_f32.val[0]),
968 vcvtq_s32_f32(res_f32.val[1])
969 }
970 };
971
972 return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
973}
974
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100975inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
976{
977 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
978
979 const int16x8_t res_s16 = vmovl_s8(a);
980
981 const int32x4x2_t res_s32 =
982 {
983 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100984 vmovl_s16(vget_low_qs16(res_s16)),
985 vmovl_s16(vget_high_qs16(res_s16))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100986 }
987 };
988
989 float32x4x2_t res_f32 =
990 {
991 {
992 vcvtq_f32_s32(res_s32.val[0]),
993 vcvtq_f32_s32(res_s32.val[1])
994 }
995 };
996
997 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
998 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
999
1000 return res_f32;
1001}
1002
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001003inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
1004{
1005 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1006 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
1007
1008 return vmulq_f32(res_f32, pow2);
1009}
1010
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001011inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
1012{
1013 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1014
1015 const int16x8x2_t res_s16 =
1016 {
1017 {
1018 vmovl_s8(vget_low_s8(a)),
1019 vmovl_s8(vget_high_s8(a)),
1020 }
1021 };
1022
1023 const int32x4x4_t res_s32 =
1024 {
1025 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001026 vmovl_s16(vget_low_qs16(res_s16.val[0])),
1027 vmovl_s16(vget_high_qs16(res_s16.val[0])),
1028 vmovl_s16(vget_low_qs16(res_s16.val[1])),
1029 vmovl_s16(vget_high_qs16(res_s16.val[1])),
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001030 }
1031 };
1032
1033 float32x4x4_t res_f32 =
1034 {
1035 {
1036 vcvtq_f32_s32(res_s32.val[0]),
1037 vcvtq_f32_s32(res_s32.val[1]),
1038 vcvtq_f32_s32(res_s32.val[2]),
1039 vcvtq_f32_s32(res_s32.val[3])
1040 }
1041 };
1042
1043 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1044 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1045 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
1046 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
1047
1048 return res_f32;
1049}
1050
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001051inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
1052{
1053 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1054
1055 const int32x4x2_t res_s32 =
1056 {
1057 {
1058 vmovl_s16(vget_low_qs16(a)),
1059 vmovl_s16(vget_high_qs16(a))
1060 }
1061 };
1062
1063 float32x4x2_t res_f32 =
1064 {
1065 {
1066 vcvtq_f32_s32(res_s32.val[0]),
1067 vcvtq_f32_s32(res_s32.val[1])
1068 }
1069 };
1070
1071 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1072 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1073
1074 return res_f32;
1075}
1076
/** Calculate the reciprocal of an 8 bit fixed point vector (8 elements)
 *
 * Normalizes the input, computes a Newton-Raphson initial estimate
 * x0 = 48/17 - 32/17 * a, refines it with three iterations of
 * x = x * (2 - a * x), then denormalizes the result.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0 in the Q format
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vshl_s8(x, shift_value);
}
1102
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001103inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1104{
1105 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1106 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1107 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1108 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001109 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001110
1111 // Find shift value
1112 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1113 const qint16x4_t temp = vshl_s16(a, shift_value);
1114
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001115 // Newton-Raphson division initial estimate X0 calculation
1116 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001117
1118 uint16x4_t set_one = vcgt_s16(x, const_one);
1119 x = vbsl_s16(set_one, const_one, x);
1120
Michalis Spyrou25466a92017-08-17 12:56:46 +01001121 // Use four iterations of Newton-Raphson method to get the result
1122 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1123 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1124 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1125 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001126
1127 return vshl_s16(x, shift_value);
1128}
1129
/** Calculate the reciprocal of an 8 bit fixed point vector (8 elements), saturating
 *
 * Same Newton-Raphson scheme as vrecip_qs8 but built from saturating
 * intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0 in the Q format
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vqshl_s8(x, shift_value);
}
1155
1156inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1157{
1158 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1159 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1160 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1161 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001162 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001163
1164 // Find shift value
1165 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1166 const qint16x4_t temp = vqshl_s16(a, shift_value);
1167
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001168 // Newton-Raphson division initial estimate X0 calculation
1169 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001170
1171 uint16x4_t set_one = vcgt_s16(x, const_one);
1172 x = vbsl_s16(set_one, const_one, x);
1173
Michalis Spyrou25466a92017-08-17 12:56:46 +01001174 // Use four iterations of Newton-Raphson method to get the result
1175 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1176 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1177 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1178 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001179
1180 return vqshl_s16(x, shift_value);
1181}
1182
/** Calculate the reciprocal of an 8 bit fixed point vector (16 elements)
 *
 * Normalizes the input, computes a Newton-Raphson initial estimate
 * x0 = 48/17 - 32/17 * a, refines it with three iterations of
 * x = x * (2 - a * x), then denormalizes the result.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vshlq_s8(x, shift_value);
}
1209
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001210inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1211{
1212 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1213 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1214 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1215 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001216 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001217
1218 // Find shift value
1219 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1220 const qint16x8_t temp = vshlq_s16(a, shift_value);
1221
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001222 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001223 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1224
1225 // Set initial guess to one if x > 1
1226 uint16x8_t set_one = vcgtq_s16(x, const_one);
1227 x = vbslq_s16(set_one, const_one, x);
1228
Michalis Spyrou25466a92017-08-17 12:56:46 +01001229 // Use four iterations of Newton-Raphson method to get the result
1230 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1231 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1232 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1233 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001234
1235 return vshlq_s16(x, shift_value);
1236}
1237
/** Calculate the reciprocal of an 8 bit fixed point vector (16 elements), saturating
 *
 * Same Newton-Raphson scheme as vrecipq_qs8 but built from saturating
 * intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vqshlq_s8(x, shift_value);
}
1264
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001265inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1266{
1267 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1268 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1269 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1270 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001271 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001272
1273 // Find shift value
1274 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1275 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1276
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001277 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001278 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1279
1280 // Set initial guess to one if x > 1
1281 uint16x8_t set_one = vcgtq_s16(x, const_one);
1282 x = vbslq_s16(set_one, const_one, x);
1283
Michalis Spyrou25466a92017-08-17 12:56:46 +01001284 // Use four iterations of Newton-Raphson method to get the result
1285 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1286 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1287 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1288 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001289
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001290 // Saturate result in case of overflow
1291 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001292}
1293
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001294inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1295{
1296 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1297}
1298
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001299inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1300{
1301 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1302}
1303
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001304inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1305{
1306 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1307}
1308
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001309inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1310{
1311 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1312}
1313
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001314template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001315inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001316{
1317 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1318 const qint8x8_t const_one = vdup_n_s8(1);
1319 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
1320 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1321 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1322 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1323 const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
1324 const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
1325 const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
1326 const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
1327 return res;
1328}
1329
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001330template <bool islog>
1331inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1332{
1333 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1334 const qint16x4_t const_one = vdup_n_s16(1);
1335 const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
1336 const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1337 const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1338 const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1339 const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
1340 const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
1341 const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
1342 const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
1343 return res;
1344}
1345
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001346template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001347inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001348{
1349 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1350 const qint8x8_t const_one = vdup_n_s8(1);
1351 const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
1352 const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1353 const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1354 const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1355 const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
1356 const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
1357 const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
1358 const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
1359 return res;
1360}
1361
1362template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001363inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1364{
1365 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1366 const qint16x4_t const_one = vdup_n_s16(1);
1367 const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
1368 const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1369 const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1370 const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1371 const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
1372 const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
1373 const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
1374 const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
1375 return res;
1376}
1377
1378template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001379inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1380{
1381 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1382 const qint8x16_t const_one = vdupq_n_s8(1);
1383 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
1384 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1385 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1386 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1387 const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
1388 const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
1389 const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
1390 const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
1391 return res;
1392}
1393
1394template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001395inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1396{
1397 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1398 const qint16x8_t const_one = vdupq_n_s16(1);
1399 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1400 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1401 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1402 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1403 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1404 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1405 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1406 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1407 return res;
1408}
1409
1410template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001411inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1412{
1413 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1414 const qint8x16_t const_one = vdupq_n_s8(1);
1415 const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
1416 const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1417 const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1418 const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1419 const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
1420 const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
1421 const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
1422 const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
1423 return res;
1424}
1425
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001426template <bool islog>
1427inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1428{
1429 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1430 const qint16x8_t const_one = vdupq_n_s16(1);
1431 const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
1432 const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1433 const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1434 const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1435 const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
1436 const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
1437 const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
1438 const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
1439 return res;
1440}
1441
/** Calculate the saturating exponential of an 8 bit fixed point vector (8 elements)
 *
 * Splits x as x = m*ln(2) + alpha (range reduction), approximates e^alpha with
 * a polynomial, then reconstructs e^x = e^alpha * 2^m via a shift.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Exponential in the same Q format
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the number of fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |x - dec_m * ln(2)|, the residual within [-log(2),log(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1.0
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: e^x = e^alpha * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1467
/** Calculate the saturating exponential of a 16 bit fixed point vector (4 elements)
 *
 * Splits x as x = m*ln(2) + alpha (range reduction), approximates e^alpha with
 * a polynomial, then reconstructs e^x = e^alpha * 2^m via a shift.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Exponential in the same Q format
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the number of fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |x - dec_m * ln(2)|, the residual within [-log(2),log(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1.0
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: e^x = e^alpha * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1493
/** Calculate the exponential of a fixed point 8 bit vector (16 elements), saturating.
 *
 * Same algorithm as vqexp_qs8, operating on a full 128-bit register:
 * exp(a) = 2^m * exp(alpha) with m = floor(a / ln(2)) and alpha the residual.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated exponential
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // Constants are stored in Q0.7 and rescaled to the requested Qx.y format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual left for the polynomial
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); the polynomial covers exp(alpha) - 1,
    // so 1 is added back afterwards
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct result as poly * 2^dec_m (saturating shift)
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1519
/** Calculate the exponential of a fixed point 16 bit vector (8 elements), saturating.
 *
 * Same algorithm as vqexp_qs16, operating on a full 128-bit register:
 * exp(a) = 2^m * exp(alpha) with m = floor(a / ln(2)) and alpha the residual.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated exponential
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // Constants are stored in Q0.15 and rescaled to the requested Qx.y format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the fractional bits)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual left for the polynomial
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha); the polynomial covers exp(alpha) - 1,
    // so 1 is added back afterwards
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct result as poly * 2^dec_m (saturating shift)
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1545
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001546inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1547{
1548 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1549 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1550 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1551
1552 // If 0 < a < 1, calculate log(1/x)
1553 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1554 qint8x8_t recip = vdup_n_s8(0);
1555 recip = vbsl_s8(calc_reciprocal, recip, a);
1556
1557 // Calculate reciprocal
1558 recip = vrecip_qs8(recip, fixed_point_position);
1559 a = vbsl_s8(calc_reciprocal, recip, a);
1560
1561 // Get decimal part of a
1562 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1563 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1564
1565 // Get exponent of 2^n which is equal or less than dec_a
1566 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1567
1568 // Get x to range (1, 2]
1569 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1570 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1571 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1572
1573 // Polynomial Approximation
1574 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1575
1576 // Reconstruct
1577 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1578
1579 // Set negative value for 0 < a < 1
1580 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1581
1582 return poly;
1583}
1584
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001585inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1586{
1587 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1588 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1589 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1590
1591 // If 0 < a < 1, calculate log(1/x)
1592 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1593 qint16x4_t recip = vdup_n_s16(0);
1594 recip = vbsl_s16(calc_reciprocal, recip, a);
1595
1596 // Calculate reciprocal
1597 recip = vrecip_qs16(recip, fixed_point_position);
1598 a = vbsl_s16(calc_reciprocal, recip, a);
1599
1600 // Get decimal part of a
1601 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1602 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1603
1604 // Get exponent of 2^n which is equal or less than dec_a
1605 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1606
1607 // Get x to range (1, 2]
1608 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1609 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1610 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1611
1612 // Polynomial Approximation
1613 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1614
1615 // Reconstruct
1616 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1617
1618 // Set negative value for 0 < a < 1
1619 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1620
1621 return poly;
1622}
1623
/** Calculate the natural logarithm of a fixed point 8 bit vector (16 elements).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used: the reciprocal is taken
 * first and the sign of the result is flipped at the end. The input is then
 * normalised into (1, 2] by removing a power of two 2^n, a polynomial approximates
 * the logarithm of the normalised value, and the result is reconstructed as
 * (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated logarithm
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    // ln(2) stored in Q0.7 (0x58) and rescaled to the requested Qx.y format
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep only the lanes that need the reciprocal (a < 1); the others stay 0
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal and substitute it on the lanes where a < 1
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent n of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n in Qx.y format
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1662
/** Calculate the natural logarithm of a fixed point 16 bit vector (8 elements).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used: the reciprocal is taken
 * first and the sign of the result is flipped at the end. The input is then
 * normalised into (1, 2] by removing a power of two 2^n, a polynomial approximates
 * the logarithm of the normalised value, and the result is reconstructed as
 * (poly + n) * ln(2).
 *
 * NOTE(review): this 16-bit variant mixes saturating ops (vqsubq/vqaddq/vqmulq,
 * vqrecipq_qs16) with the non-saturating structure of vlogq_qs8 — presumably to
 * guard against overflow in intermediate values; confirm against the reference
 * implementation.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated logarithm
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    // ln(2) stored in Q0.15 (0x58B9) and rescaled to the requested Qx.y format
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep only the lanes that need the reciprocal (a < 1); the others stay 0
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal and substitute it on the lanes where a < 1
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent n of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n in Qx.y format
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1701
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * The input is first scaled into the (0.5, 2) range, Newton-Raphson iterations
 * x <- (x / 2) * (3 - a * x^2) refine the estimate, and the result is rescaled
 * by half the normalisation shift (since 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshl_s8(x, shift_value2);
}
1728
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * The input is first scaled into the (0.5, 2) range, Newton-Raphson iterations
 * x <- (x / 2) * (3 - a * x^2) refine the estimate, and the result is rescaled
 * by half the normalisation shift (since 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshl_s16(x, shift_value2);
}
1757
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements), saturating.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8, but all arithmetic that could
 * overflow uses the saturating intrinsics (vqsub/vqadd/vqneg/vqshl/vqmul).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshl_s8(x, shift_value2);
}
1784
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements), saturating.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16, but all arithmetic that could
 * overflow uses the saturating intrinsics (vqsub/vqadd/vqneg/vqshl/vqmul).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshl_s16(x, shift_value2);
}
1813
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshlq_s8(x, shift_value2);
}
1840
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshlq_s16(x, shift_value2);
}
1869
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements), saturating.
 *
 * Same Newton-Raphson scheme as vqinvsqrt_qs8, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshlq_s8(x, shift_value2);
}
1896
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements), saturating.
 *
 * Same Newton-Raphson scheme as vqinvsqrt_qs16, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshlq_s16(x, shift_value2);
}
1925
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001926inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001927{
1928 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1929 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1930
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001931 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1932 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1933 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1934 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001935
1936 return tanh;
1937}
1938
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001939inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001940{
1941 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1942 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1943
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001944 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1945 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1946 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1947 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001948
1949 return tanh;
1950}
1951
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001952inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001953{
1954 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1955 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1956
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001957 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1958 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1959 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1960 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001961
1962 return tanh;
1963}
1964
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001965inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1966{
1967 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1968 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1969
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001970 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1971 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1972 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1973 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001974
1975 return tanh;
1976}
1977
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001978inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1979{
1980 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1981}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001982
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001983inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1984{
1985 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1986}
1987
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001988inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1989{
1990 float32x4x2_t res =
1991 {
1992 {
1993 vmaxq_f32(a.val[0], b.val[0]),
1994 vmaxq_f32(a.val[1], b.val[1])
1995 }
1996 };
1997 return res;
1998}
Gian Marco Iodice356f6432017-09-22 11:32:21 +01001999} // namespace arm_compute