blob: 05e481561d2f547e2e172057908cd419d59ebdcb [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
// Note: each vector lane holds the same coefficient; the trailing comment on each
// entry gives the decimal value the fixed-point constant approximates.
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
// Note: each vector lane holds the same coefficient; the trailing comment on each
// entry gives the decimal value the fixed-point constant approximates.
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the low half (first 8 elements) of a QS8 vector. Forwards to vget_low_s8. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the low half (first 4 elements) of a QS16 vector. Forwards to vget_low_s16. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the high half (last 8 elements) of a QS8 vector. Forwards to vget_high_s8. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the high half (last 4 elements) of a QS16 vector. Forwards to vget_high_s16. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 QS8 elements from @p addr. Forwards to vld1_s8. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr. Forwards to vld1_s16. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr. Forwards to vld1q_s8. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr. Forwards to vld1q_s16. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** De-interleaving load of 2x8 QS16 elements from @p addr. Forwards to vld2q_s16. */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 QS8 elements to @p addr. Forwards to vst1_s8. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr. Forwards to vst1_s16. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr. Forwards to vst1q_s8. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr. Forwards to vst1q_s16. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Interleaving store of 2x8 QS16 elements to @p addr. Forwards to vst2q_s16. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow of a QS16 vector to QS8. Forwards to vqmovn_s16. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow of a QS32 vector to QS16. Forwards to vqmovn_s32. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
223
/** Duplicate a QS8 scalar across all 8 lanes. Forwards to vdup_n_s8. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar across all 4 lanes. Forwards to vdup_n_s16. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar across all 16 lanes. Forwards to vdupq_n_s8. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
239inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
240{
241 float32x4x4_t res =
242 {
243 {
244 vdupq_n_f32(a),
245 vdupq_n_f32(a),
246 vdupq_n_f32(a),
247 vdupq_n_f32(a),
248 }
249 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100250 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251}
252
/** Duplicate a QS16 scalar across all 8 lanes. Forwards to vdupq_n_s16. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar across all 4 lanes. Forwards to vdupq_n_s32. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
262
/** Absolute value of 8 QS8 elements (non-saturating). Forwards to vabs_s8. */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating). Forwards to vabs_s16. */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating). Forwards to vabsq_s8. */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating). Forwards to vabsq_s16. */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. Forwards to vqabs_s8. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. Forwards to vqabs_s16. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. Forwards to vqabsq_s8. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. Forwards to vqabsq_s16. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
302
/** Element-wise maximum of two 8-element QS8 vectors. Forwards to vmax_s8. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 4-element QS16 vectors. Forwards to vmax_s16. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 16-element QS8 vectors. Forwards to vmaxq_s8. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8-element QS8 vectors. Forwards to vpmax_s8. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 4-element QS16 vectors. Forwards to vpmax_s16. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 8-element QS16 vectors. Forwards to vmaxq_s16. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8-element QS8 vectors. Forwards to vmin_s8. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 4-element QS16 vectors. Forwards to vmin_s16. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 16-element QS8 vectors. Forwards to vminq_s8. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8-element QS8 vectors. Forwards to vpmin_s8. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 4-element QS16 vectors. Forwards to vpmin_s16. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 8-element QS16 vectors. Forwards to vminq_s16. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
362
/** Element-wise addition of two 8-element QS8 vectors (wrapping). Forwards to vadd_s8. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Element-wise addition of two 4-element QS16 vectors (wrapping). Forwards to vadd_s16. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Element-wise addition of two 16-element QS8 vectors (wrapping). Forwards to vaddq_s8. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Element-wise addition of two 8-element QS16 vectors (wrapping). Forwards to vaddq_s16. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating addition of two 8-element QS8 vectors. Forwards to vqadd_s8. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating addition of two 4-element QS16 vectors. Forwards to vqadd_s16. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating addition of two 2-element QS32 vectors. Forwards to vqadd_s32. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating addition of two 16-element QS8 vectors. Forwards to vqaddq_s8. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating addition of two 8-element QS16 vectors. Forwards to vqaddq_s16. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating addition of two 4-element QS32 vectors. Forwards to vqaddq_s32. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-and-widen of 8 QS8 elements to 4 16-bit sums. Forwards to vpaddl_s8. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
417
/** Element-wise subtraction of two 8-element QS8 vectors (wrapping). Forwards to vsub_s8. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Element-wise subtraction of two 4-element QS16 vectors (wrapping). Forwards to vsub_s16. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Element-wise subtraction of two 16-element QS8 vectors (wrapping). Forwards to vsubq_s8. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Element-wise subtraction of two 8-element QS16 vectors (wrapping). Forwards to vsubq_s16. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtraction of two 8-element QS8 vectors. Forwards to vqsub_s8. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtraction of two 4-element QS16 vectors. Forwards to vqsub_s16. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtraction of two 16-element QS8 vectors. Forwards to vqsubq_s8. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtraction of two 8-element QS16 vectors. Forwards to vqsubq_s16. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
457
/** Fixed-point multiplication of two 8-element QS8 vectors.
 *
 * Widens to 16 bit, adds a rounding bias of 2^(fixed_point_position - 1),
 * shifts right by @p fixed_point_position (vshlq with a negative shift count)
 * and narrows back without saturation (vmovn).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiplication of two 4-element QS16 vectors.
 *
 * Widens to 32 bit, adds a rounding bias, shifts right by
 * @p fixed_point_position and narrows back without saturation.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
491
/** Fixed-point multiplication of two 16-element QS8 vectors.
 *
 * Processes the low and high halves separately (widening to 16 bit), applies
 * a rounding bias and a right shift by @p fixed_point_position, then narrows
 * and recombines without saturation.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiplication of two 8-element QS16 vectors.
 *
 * Same scheme as vmulq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
531
/** Saturating fixed-point multiplication of two 8-element QS8 vectors.
 *
 * Like vmul_qs8 but uses the saturating shift (vqshlq) and saturating
 * narrow (vqmovn), so overflowing products clamp instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiplication of two 4-element QS16 vectors.
 *
 * Like vmul_qs16 but with saturating shift and saturating narrow.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
565
/** Saturating fixed-point multiplication of two 16-element QS8 vectors.
 *
 * Low/high halves are widened to 16 bit, biased for rounding, shifted right
 * with saturation and narrowed back with saturation before recombining.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiplication of two 8-element QS16 vectors.
 *
 * Same scheme as vqmulq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
605
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
607{
608 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
609
610 qint16x8_t res = vmull_s8(a, b);
611
612 return vqrshlq_s16(res, fixed_point_position_s16);
613}
614
/** Fixed-point multiply-accumulate: a + b * c for 8-element QS8 vectors.
 *
 * The product b * c is computed with rounding (bias of 2^(fixed_point_position - 1))
 * and a right shift by @p fixed_point_position; the final accumulation wraps
 * (non-saturating vadd_s8).
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c for 4-element QS16 vectors.
 *
 * Same scheme as vmla_qs8 but widening to 32 bit.
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
648
/** Fixed-point multiply-accumulate: a + b * c for 16-element QS8 vectors.
 *
 * Low/high halves of b * c are widened, rounded and shifted separately, then
 * narrowed and added (wrapping) to the matching halves of @p a.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c for 8-element QS16 vectors.
 *
 * Same scheme as vmlaq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
688
/** Saturating fixed-point multiply-accumulate: a + b * c for 8-element QS8 vectors.
 *
 * Like vmla_qs8 but every step that can overflow saturates (vqshlq, vqmovn, vqadd).
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c for 4-element QS16 vectors.
 *
 * Like vmla_qs16 but with saturating shift, narrow and accumulate.
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
722
/** Saturating fixed-point multiply-accumulate: a + b * c for 16-element QS8 vectors.
 *
 * Low/high halves of b * c are widened, rounded, saturating-shifted and
 * saturating-narrowed, then added to @p a with saturation.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c for 8-element QS16 vectors.
 *
 * Same scheme as vqmlaq_qs8 but widening to 32 bit per half.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
764
/** Widening fixed-point multiply-accumulate: a (QS16) + b * c (QS8 operands).
 *
 * The product stays in 16 bit: it is rounded, shifted right by
 * @p fixed_point_position and accumulated into @p a without narrowing (wrapping add).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a (QS32) + b * c (QS16 operands).
 *
 * Same scheme as vmlal_qs8 with a 32-bit accumulator.
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
798
/** Saturating widening fixed-point multiply-accumulate: a (QS16) + b * c (QS8 operands).
 *
 * Like vmlal_qs8 but the shift and the final accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate: a (QS32) + b * c (QS16 operands).
 *
 * Like vmlal_qs16 but the shift and the final accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
832
/** Convert 8 floats (two float32x4_t) to a QS8 vector with saturation.
 *
 * Scales by 2^fixed_point_position with round-to-nearest (a +/-0.5 bias chosen
 * per lane via vbslq_f32 on the sign), converts to s32 (truncation towards zero
 * after the bias) and narrows twice with saturation down to 8 bit.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bit, saturating at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
860
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100861inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100862{
863 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
864
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100865 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100866
867 res_f32 = vmlaq_f32(res_f32, a, pow2);
868
869 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
870
871 return vqmovn_s32(res_s32);
872}
873
/** Convert 16 floats (four float32x4_t) to a 16-element QS8 vector with saturation.
 *
 * Same scheme as vqcvt_qs8_f32 applied to four quads: round-to-nearest scale
 * by 2^fixed_point_position, convert to s32, then saturating narrow 32 -> 16 -> 8 bit.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 bit with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 bit with saturation and recombine
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
913
/** Convert a float32x4x2_t vector (8 floats) to a 16 bit fixed point vector with saturation.
 *
 * @param[in] a                    Float vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted, saturated fixed point vector
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-matched +/-0.5 bias per lane implements round-half-away-from-zero on the truncating conversion below
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale each float by 2^fixed_point_position and add the rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow 32 -> 16 bit
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
939
/** Convert an 8 bit fixed point vector to a float32x4x2_t vector (8 floats).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bit
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
967
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100968inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
969{
970 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
971 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
972
973 return vmulq_f32(res_f32, pow2);
974}
975
/** Convert an 8 bit fixed point vector (16 elements) to a float32x4x4_t vector (16 floats).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1015
/** Convert a 16 bit fixed point vector (8 elements) to a float32x4x2_t vector (8 floats).
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return The converted float vector
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Multiplying by 2^-fixed_point_position moves the binary point back to the right place
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1041
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001042inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
1043{
1044 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001045 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
1046 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
1047 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001048
1049 // Find shift value
1050 const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1051 const qint8x8_t temp = vshl_s8(a, shift_value);
1052
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001053 qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001054
1055 uint8x8_t set_one = vcgt_s8(x, const_one);
1056 x = vbsl_s8(set_one, const_one, x);
1057
1058 // Use three iterations of Newton-Raphson method to get the result
1059 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1060 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1061 x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1062
1063 return vshl_s8(x, shift_value);
1064}
1065
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001066inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1067{
1068 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1069 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1070 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1071 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1072
1073 // Find shift value
1074 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1075 const qint16x4_t temp = vshl_s16(a, shift_value);
1076
1077 qint16x4_t x = vadd_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
1078
1079 uint16x4_t set_one = vcgt_s16(x, const_one);
1080 x = vbsl_s16(set_one, const_one, x);
1081
1082 // Use five iterations of Newton-Raphson method to get the result
1083 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1084 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1085 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1086 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1087 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1088
1089 return vshl_s16(x, shift_value);
1090}
1091
Georgios Pinitas9247c922017-06-28 18:29:47 +01001092inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
1093{
1094 // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
1095 const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
1096 const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
1097 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1098
1099 // Find shift value
1100 const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1101 const qint8x8_t temp = vqshl_s8(a, shift_value);
1102
1103 qint8x8_t x = vqadd_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));
1104
1105 uint8x8_t set_one = vcgt_s8(x, const_one);
1106 x = vbsl_s8(set_one, const_one, x);
1107
1108 // Use three iterations of Newton-Raphson method to get the result
1109 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1110 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1111 x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
1112
1113 return vqshl_s8(x, shift_value);
1114}
1115
1116inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1117{
1118 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1119 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1120 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1121 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1122
1123 // Find shift value
1124 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1125 const qint16x4_t temp = vqshl_s16(a, shift_value);
1126
1127 qint16x4_t x = vqadd_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
1128
1129 uint16x4_t set_one = vcgt_s16(x, const_one);
1130 x = vbsl_s16(set_one, const_one, x);
1131
1132 // Use five iterations of Newton-Raphson method to get the result
1133 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1134 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1135 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1136 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1137 x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1138
1139 return vqshl_s16(x, shift_value);
1140}
1141
/** Calculate the reciprocal (1/a) of each lane of an 8 bit fixed point vector (16 elements)
 * using the Newton-Raphson method (3 iterations, non-saturating arithmetic).
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; earlier comment said -1.8823)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1166
/** Calculate the reciprocal (1/a) of each lane of a 16 bit fixed point vector (8 elements)
 * using the Newton-Raphson method (5 iterations, non-saturating arithmetic).
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): the d-register variant encodes 48/17 as 0x5A5A, which is the closer Q2.13
    // value of 2.8235; 0x5A56 here looks like a typo — confirm intended constant.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1193
/** Calculate the saturating reciprocal (1/a) of each lane of an 8 bit fixed point vector (16 elements)
 * using the Newton-Raphson method (3 iterations, saturating arithmetic).
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; earlier comment said -1.8823)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1218
/** Calculate the saturating reciprocal (1/a) of each lane of a 16 bit fixed point vector (8 elements)
 * using the Newton-Raphson method (5 iterations, saturating arithmetic).
 * Lanes where a == 0 return the maximum representable value instead of overflowing.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): the d-register variant encodes 48/17 as 0x5A5A, which is the closer Q2.13
    // value of 2.8235; 0x5A56 here looks like a typo — confirm intended constant.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes each lane into [0.5, 1) so Newton-Raphson converges
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x * (1 - d * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero yields the largest positive value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1246
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001247inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1248{
1249 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1250}
1251
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001252inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1253{
1254 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1255}
1256
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001257inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1258{
1259 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1260}
1261
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001262inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1263{
1264 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1265}
1266
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001267template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001268inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001269{
1270 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1271 const qint8x8_t const_one = vdup_n_s8(1);
1272 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
1273 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1274 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1275 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1276 const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
1277 const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
1278 const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
1279 const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
1280 return res;
1281}
1282
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001283template <bool islog>
1284inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1285{
1286 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1287 const qint16x4_t const_one = vdup_n_s16(1);
1288 const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
1289 const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1290 const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1291 const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1292 const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
1293 const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
1294 const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
1295 const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
1296 return res;
1297}
1298
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on an 8 bit fixed point
 * vector using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1314
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on a 16 bit fixed point
 * vector using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1330
1331template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001332inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1333{
1334 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1335 const qint8x16_t const_one = vdupq_n_s8(1);
1336 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
1337 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1338 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1339 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1340 const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
1341 const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
1342 const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
1343 const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
1344 return res;
1345}
1346
1347template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001348inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1349{
1350 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1351 const qint16x8_t const_one = vdupq_n_s16(1);
1352 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1353 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1354 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1355 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1356 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1357 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1358 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1359 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1360 return res;
1361}
1362
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on an 8 bit fixed point
 * vector (16 elements) using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1378
/** Evaluate a degree-4 polynomial (Horner scheme, no constant term) on a 16 bit fixed point
 * vector (8 elements) using saturating arithmetic; exp table coefficients, or log table when islog is true.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the target Q format; the leading log
    // coefficient carries one extra bit of magnitude, hence the extra shift
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation from the highest-order coefficient down
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1394
/** Calculate the saturating exponential of each lane of an 8 bit fixed point vector.
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    // 0x58/2^7 = 0.6875 ~= ln(2); rescaled from Q0.7 to the target Q format
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    // 0x38/2^7 = 0.4375; OR-ing in const_one adds the integer 1, giving ~1.4375 ~= 1/ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1420
/** Calculate the saturating exponential of each lane of a 16 bit fixed point vector.
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9/2^15 ~= 0.6931 = ln(2); rescaled from Q0.15 to the target Q format
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA/2^15 ~= 0.4427; OR-ing in const_one adds the integer 1, giving ~1.4427 = 1/ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1446
/** Calculate the saturating exponential of each lane of an 8 bit fixed point vector (16 elements).
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    8 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // 0x58/2^7 = 0.6875 ~= ln(2); rescaled from Q0.7 to the target Q format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x38/2^7 = 0.4375; OR-ing in const_one adds the integer 1, giving ~1.4375 ~= 1/ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1472
/** Calculate the saturating exponential of each lane of a 16 bit fixed point vector (8 elements).
 * Uses range reduction to [-ln(2), ln(2)], a polynomial approximation, and a final
 * power-of-two reconstruction shift.
 *
 * @param[in] a                    16 bit fixed point input vector
 * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers
 *
 * @return exp(a) in the same Q format, saturated
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9/2^15 ~= 0.6931 = ln(2); rescaled from Q0.15 to the target Q format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
    // 0x38AA/2^15 ~= 0.4427; OR-ing in const_one adds the integer 1, giving ~1.4427 = 1/ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer quotient of m (number of whole ln(2) factors in a)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument in [0, ln(2)]
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1498
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001499inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1500{
1501 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1502 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1503 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1504
1505 // If 0 < a < 1, calculate log(1/x)
1506 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1507 qint8x8_t recip = vdup_n_s8(0);
1508 recip = vbsl_s8(calc_reciprocal, recip, a);
1509
1510 // Calculate reciprocal
1511 recip = vrecip_qs8(recip, fixed_point_position);
1512 a = vbsl_s8(calc_reciprocal, recip, a);
1513
1514 // Get decimal part of a
1515 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1516 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1517
1518 // Get exponent of 2^n which is equal or less than dec_a
1519 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1520
1521 // Get x to range (1, 2]
1522 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1523 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1524 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1525
1526 // Polynomial Approximation
1527 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1528
1529 // Reconstruct
1530 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1531
1532 // Set negative value for 0 < a < 1
1533 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1534
1535 return poly;
1536}
1537
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001538inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1539{
1540 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1541 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1542 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1543
1544 // If 0 < a < 1, calculate log(1/x)
1545 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1546 qint16x4_t recip = vdup_n_s16(0);
1547 recip = vbsl_s16(calc_reciprocal, recip, a);
1548
1549 // Calculate reciprocal
1550 recip = vrecip_qs16(recip, fixed_point_position);
1551 a = vbsl_s16(calc_reciprocal, recip, a);
1552
1553 // Get decimal part of a
1554 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1555 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1556
1557 // Get exponent of 2^n which is equal or less than dec_a
1558 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1559
1560 // Get x to range (1, 2]
1561 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1562 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1563 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1564
1565 // Polynomial Approximation
1566 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1567
1568 // Reconstruct
1569 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1570
1571 // Set negative value for 0 < a < 1
1572 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1573
1574 return poly;
1575}
1576
/** Calculate the logarithm of a fixed point 8-bit vector (16 elements).
 *
 * For inputs in (0, 1) the identity log(a) = -log(1/a) is used, so the range
 * reduction below always starts from a value >= 1. The value is then scaled
 * into (1, 2], a Taylor polynomial approximates log on that interval, and the
 * power-of-two scaling is folded back in (times ln(2)).
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated logarithm in the same QS8 format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    // ln(2) stored in Q0.7 and shifted down to the requested format
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep 'a' only in the lanes that need a reciprocal; other lanes stay 0 and
    // their (meaningless) reciprocal is discarded by the select after vrecipq_qs8.
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), where n is the extracted binary exponent
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1615
/** Calculate the logarithm of a fixed point 16-bit vector (8 elements).
 *
 * For inputs in (0, 1) the identity log(a) = -log(1/a) is used, so the range
 * reduction below always starts from a value >= 1. The value is then scaled
 * into (1, 2], a Taylor polynomial approximates log on that interval, and the
 * power-of-two scaling is folded back in (times ln(2)). Saturating add/sub/mul
 * variants are used where intermediate overflow is possible.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated logarithm in the same QS16 format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    // ln(2) stored in Q0.15 and shifted down to the requested format
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep 'a' only in the lanes that need a reciprocal; other lanes stay 0 and
    // their (meaningless) reciprocal is discarded by the select after vqrecipq_qs16.
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), where n is the extracted binary exponent
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1654
/** Calculate the inverse square root of a fixed point 8-bit vector (8 elements).
 *
 * The input is scaled by an even power of two into the (0.5, 2) range, the
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) is run three times,
 * and the result is rescaled by half the original shift (since
 * 1/sqrt(a * 2^(2k)) = 2^(-k) / sqrt(a)).
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    // 8 - (clz(a) + fpp) is the exponent of a's leading bit relative to 1.0.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshl_s8(x, shift_value2);
}
1681
/** Calculate the inverse square root of a fixed point 16-bit vector (4 elements).
 *
 * The input is scaled by an even power of two into the (0.5, 2) range, the
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) is run five times,
 * and the result is rescaled by half the original shift (since
 * 1/sqrt(a * 2^(2k)) = 2^(-k) / sqrt(a)).
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    // 16 - (clz(a) + fpp) is the exponent of a's leading bit relative to 1.0.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshl_s16(x, shift_value2);
}
1710
/** Calculate the saturating inverse square root of a fixed point 8-bit vector (8 elements).
 *
 * Same algorithm as vinvsqrt_qs8 (normalise into (0.5, 2), three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2), rescale by half the shift), but using
 * saturating add/sub/neg/shift/multiply intrinsics so intermediates clamp
 * instead of wrapping.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshl_s8(x, shift_value2);
}
/** Calculate the saturating inverse square root of a fixed point 16-bit vector (4 elements).
 *
 * Same algorithm as vinvsqrt_qs16 (normalise into (0.5, 2), five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2), rescale by half the shift), but using
 * saturating add/sub/neg/shift/multiply intrinsics so intermediates clamp
 * instead of wrapping.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshl_s16(x, shift_value2);
}
1766
/** Calculate the inverse square root of a fixed point 8-bit vector (16 elements).
 *
 * Quad-register variant of vinvsqrt_qs8: normalise into (0.5, 2), run three
 * Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2), then rescale the
 * result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshlq_s8(x, shift_value2);
}
1793
/** Calculate the inverse square root of a fixed point 16-bit vector (8 elements).
 *
 * Quad-register variant of vinvsqrt_qs16: normalise into (0.5, 2), run five
 * Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2), then rescale the
 * result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vshlq_s16(x, shift_value2);
}
1822
/** Calculate the saturating inverse square root of a fixed point 8-bit vector (16 elements).
 *
 * Quad-register, saturating variant of vinvsqrt_qs8: normalise into (0.5, 2),
 * run three Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2) with
 * saturating arithmetic, then rescale the result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS8 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return The calculated inverse square root in the same QS8 format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshlq_s8(x, shift_value2);
}
1849
/** Calculate the saturating inverse square root of a fixed point 16-bit vector (8 elements).
 *
 * Quad-register, saturating variant of vinvsqrt_qs16: normalise into (0.5, 2),
 * run five Newton-Raphson iterations x' = (x / 2) * (3 - a * x^2) with
 * saturating arithmetic, then rescale the result by half the normalisation shift.
 *
 * @param[in] a                    Input vector in QS16 format. Assumed > 0 — TODO confirm caller guarantees this.
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return The calculated inverse square root in the same QS16 format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Compensating shift for the result: half the normalisation shift, negated.
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise a into the convergence range.
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalisation on the result.
    return vqshlq_s16(x, shift_value2);
}
1878
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001879inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001880{
1881 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1882 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1883
1884 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1885 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1886 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001887 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001888
1889 return tanh;
1890}
1891
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001892inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001893{
1894 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1895 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1896
1897 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1898 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1899 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001900 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001901
1902 return tanh;
1903}
1904
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001905inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001906{
1907 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1908 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1909
1910 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1911 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1912 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1913 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1914
1915 return tanh;
1916}
1917
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001918inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1919{
1920 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1921 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1922
1923 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1924 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1925 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1926 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1927
1928 return tanh;
1929}
1930
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001931inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1932{
1933 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1934}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001935
1936inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1937{
1938 float32x4x2_t res =
1939 {
1940 {
1941 vmaxq_f32(a.val[0], b.val[0]),
1942 vmaxq_f32(a.val[1], b.val[1])
1943 }
1944 };
1945 return res;
1946}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001947}