blob: 4e862ba387f381662f0b52333b2fbefcb2ac29c0 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};
51
/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};
99
/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the lower half (first 8 elements) of a 16-element 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (first 4 elements) of an 8-element 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (last 8 elements) of a 16-element 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (last 4 elements) of an 8-element 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load an 8 bit fixed point vector (8 elements) from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load a 16 bit fixed point vector (4 elements) from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load an 8 bit fixed point vector (16 elements) from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load a 16 bit fixed point vector (8 elements) from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}
163
/** Load a single 8 bit fixed point value from memory and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single 16 bit fixed point value from memory and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single 8 bit fixed point value from memory and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single 16 bit fixed point value from memory and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store an 8 bit fixed point vector (8 elements) to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store a 16 bit fixed point vector (4 elements) to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store an 8 bit fixed point vector (16 elements) to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store a 16 bit fixed point vector (8 elements) to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two 16 bit fixed point vectors (8 elements each) to memory with 2-way interleaving. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
208
/** Saturating narrow: convert a 16 bit fixed point vector (8 elements) to 8 bit with saturation. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: convert a 32 bit fixed point vector (4 elements) to 16 bit with saturation. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Duplicate an 8 bit fixed point value across all 8 lanes of a vector. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a 16 bit fixed point value across all 4 lanes of a vector. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate an 8 bit fixed point value across all 16 lanes of a vector. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Duplicate a float across 16 lanes and convert the result to 8 bit fixed point.
 *
 * @param[in] a                    Floating point value to duplicate
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 8 bit fixed point vector (16 elements) with every lane equal to @p a converted to Qx.y
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    // Delegate the float -> fixed point conversion (with rounding and saturation)
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Duplicate a 16 bit fixed point value across all 8 lanes of a vector. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a 32 bit fixed point value across all 4 lanes of a vector. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
257
/** Absolute value of an 8 bit fixed point vector (8 elements). Not saturating. */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of a 16 bit fixed point vector (4 elements). Not saturating. */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of an 8 bit fixed point vector (16 elements). Not saturating. */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of a 16 bit fixed point vector (8 elements). Not saturating. */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of an 8 bit fixed point vector (8 elements). */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of a 16 bit fixed point vector (4 elements). */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of an 8 bit fixed point vector (16 elements). */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of a 16 bit fixed point vector (8 elements). */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
297
/** Element-wise maximum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
357
/** Add two 8 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two 16 bit fixed point vectors (4 elements). Wraps on overflow. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two 8 bit fixed point vectors (16 elements). Wraps on overflow. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two 16 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two 32 bit fixed point vectors (2 elements). */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating add of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating add of two 32 bit fixed point vectors (4 elements). */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-long: sum adjacent pairs of an 8 bit fixed point vector, widening to 16 bit. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

/** Subtract two 8 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two 16 bit fixed point vectors (4 elements). Wraps on overflow. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two 8 bit fixed point vectors (16 elements). Wraps on overflow. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two 16 bit fixed point vectors (8 elements). Wraps on overflow. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two 8 bit fixed point vectors (8 elements). */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two 8 bit fixed point vectors (16 elements). */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two 16 bit fixed point vectors (8 elements). */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
452
/** Multiply two 8 bit fixed point vectors (8 elements).
 *
 * The products are accumulated in 16 bit, rounded (round-half-up via the added
 * 1 << (fixed_point_position - 1) constant), shifted back and narrowed without saturation.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a * b in the same fixed point format
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount: vshlq with a negative count performs an arithmetic right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Multiply two 16 bit fixed point vectors (4 elements).
 *
 * Products are accumulated in 32 bit with rounding, then narrowed without saturation.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Multiply two 8 bit fixed point vectors (16 elements).
 *
 * The low and high halves are processed separately in 16 bit with rounding,
 * then narrowed (without saturation) and recombined.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Multiply two 16 bit fixed point vectors (8 elements).
 *
 * The low and high halves are processed separately in 32 bit with rounding,
 * then narrowed (without saturation) and recombined.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
526
/** Saturating multiply of two 8 bit fixed point vectors (8 elements).
 *
 * Products are accumulated in 16 bit with rounding; the shift and final narrow
 * are both saturating (vqshlq / vqmovn).
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a * b in the same fixed point format, saturated
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating multiply of two 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating multiply of two 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating multiply of two 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}

/** Multiply-long: multiply two 8 bit fixed point vectors (8 elements) producing a 16 bit result.
 *
 * Uses a rounding saturating shift (vqrshlq) to rescale, unlike the narrowing
 * multiplies above which add an explicit rounding constant.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
609
/** Multiply-accumulate: a + b * c for 8 bit fixed point vectors (8 elements).
 *
 * The product is computed in 16 bit with rounding, rescaled, narrowed
 * (without saturation) and added to @p a (wrapping add).
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiply operand
 * @param[in] c                    Second multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Multiply-accumulate: a + b * c for 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Multiply-accumulate: a + b * c for 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Multiply-accumulate: a + b * c for 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
683
/** Saturating multiply-accumulate: a + b * c for 8 bit fixed point vectors (8 elements).
 *
 * The product is computed in 16 bit with rounding; shift, narrow and the final
 * accumulation are all saturating.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiply operand
 * @param[in] c                    Second multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format, saturated
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating multiply-accumulate: a + b * c for 16 bit fixed point vectors (4 elements). */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating multiply-accumulate: a + b * c for 8 bit fixed point vectors (16 elements).
 *
 * Low/high halves are processed separately in 16 bit, then recombined.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating multiply-accumulate: a + b * c for 16 bit fixed point vectors (8 elements).
 *
 * Low/high halves are processed separately in 32 bit, then recombined.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
759
/** Widening multiply-accumulate: a + b * c, 8 bit operands accumulated into a 16 bit vector (8 elements).
 *
 * The product is rounded and rescaled before the (wrapping) accumulation; the
 * result stays in the widened 16 bit fixed point format.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiply operand
 * @param[in] c                    Second 8 bit multiply operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 16 bit fixed point vector
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening multiply-accumulate: a + b * c, 16 bit operands accumulated into a 32 bit vector (4 elements). */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Saturating widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator (8 elements).
 *
 * Both the rescaling shift and the accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator (4 elements). */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
827
/** Convert two float32 vectors (8 floats total) to an 8 bit fixed point vector (8 elements), saturating.
 *
 * Rounds to nearest by adding +/-0.5 (sign-dependent) before scaling by
 * 2^fixed_point_position and truncating with vcvtq_s32_f32.
 *
 * @param[in] a                    Pair of float vectors to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return The converted, saturated 8 bit fixed point vector
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bit with saturation at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert a float32 vector to a 16 bit fixed point vector (4 elements), saturating.
 *
 * Same sign-aware rounding scheme as vqcvt_qs8_f32.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Narrow 32 -> 16 bit with saturation
    return vqmovn_s32(res_s32);
}
868
// Convert sixteen floats (four float32x4_t lanes) to a saturated Q8 fixed
// point vector: scale by 2^fixed_point_position, round half away from zero,
// then narrow 32 -> 16 -> 8 bits with saturation.
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero  = vdupq_n_f32(0);
    const float32x4_t half_pos = vdupq_n_f32(0.5f);
    const float32x4_t half_neg = vdupq_n_f32(-0.5f);

    int32x4_t rounded[4];
    for(int i = 0; i < 4; ++i)
    {
        // Select a +/-0.5 bias matching the sign, accumulate a * 2^n on top of
        // it, then truncate: together this rounds half away from zero.
        const float32x4_t bias = vbslq_f32(vcgeq_f32(a.val[i], zero), half_pos, half_neg);
        rounded[i]             = vcvtq_s32_f32(vmlaq_f32(bias, a.val[i], scale));
    }

    // Saturating narrow: 4x int32x4 -> 2x int16x8 -> 1x int8x16
    const int16x8_t low  = vcombine_s16(vqmovn_s32(rounded[0]), vqmovn_s32(rounded[1]));
    const int16x8_t high = vcombine_s16(vqmovn_s32(rounded[2]), vqmovn_s32(rounded[3]));
    return vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
}
908
// Convert eight floats (two float32x4_t lanes) to a saturated Q16 fixed
// point vector: scale by 2^fixed_point_position, round half away from zero,
// then narrow 32 -> 16 bits with saturation.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero  = vdupq_n_f32(0);

    int32x4_t rounded[2];
    for(int i = 0; i < 2; ++i)
    {
        // +/-0.5 bias by sign, multiply-accumulate, truncate: round half away from zero
        const float32x4_t bias = vbslq_f32(vcgeq_f32(a.val[i], zero), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
        rounded[i]             = vcvtq_s32_f32(vmlaq_f32(bias, a.val[i], scale));
    }

    // Saturating narrow to 16 bits
    return vcombine_s16(vqmovn_s32(rounded[0]), vqmovn_s32(rounded[1]));
}
934
// Convert eight Q8 fixed point values to floats (two float32x4_t lanes):
// sign-extend 8 -> 16 -> 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t step    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    const int16x8_t   widened = vmovl_s8(a);

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))), step);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened))), step);
    return out;
}
962
// Convert four Q16 fixed point values to floats: sign-extend to 32 bits,
// convert, then divide by 2^fixed_point_position.
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
970
// Convert sixteen Q8 fixed point values to floats (four float32x4_t lanes):
// sign-extend 8 -> 16 -> 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t step = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen in two stages: 8 -> 16 bits, then 16 -> 32 bits per half
    const int16x8_t low  = vmovl_s8(vget_low_s8(a));
    const int16x8_t high = vmovl_s8(vget_high_s8(a));

    const int32x4_t widened[4] =
    {
        vmovl_s16(vget_low_qs16(low)),
        vmovl_s16(vget_high_qs16(low)),
        vmovl_s16(vget_low_qs16(high)),
        vmovl_s16(vget_high_qs16(high)),
    };

    float32x4x4_t out;
    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(vcvtq_f32_s32(widened[i]), step);
    }
    return out;
}
1010
// Convert eight Q16 fixed point values to floats (two float32x4_t lanes):
// sign-extend to 32 bits, convert, then divide by 2^fixed_point_position.
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t step = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), step);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), step);
    return out;
}
1036
// Per-lane reciprocal (1/a) of a Q8 fixed point vector, non-saturating ops.
// The input is normalized into [0.5, 1) by a left shift, the reciprocal of the
// normalized value is found with Newton-Raphson, and the same shift is applied
// again to undo the normalization (1/(x*2^s) == (1/x)*2^-s).
// NOTE(review): negative inputs receive a positive seed and do not appear to
// converge - confirm callers only pass positive divisors.
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition, which always overshot past 1 and relied on
    // the clamp below; the quad-register variant vrecipq_qs8 subtracts.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1060
// Per-lane reciprocal (1/a) of a Q16 fixed point vector, non-saturating ops.
// Normalize into [0.5, 1), run Newton-Raphson, then undo the normalization.
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: for 16-bit lanes the bit width is 16 (was 8, which
    // never normalized the operand into [0.5, 1); vrecipq_qs16 uses 16)
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vshl_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant subtracts.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1086
// Per-lane reciprocal (1/a) of a Q8 fixed point vector, saturating ops.
// Same algorithm as vrecip_qs8 but using vq* intrinsics so intermediate
// overflow saturates instead of wrapping.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    // (vqsub_s8 for consistency with vqrecipq_qs8)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant vqrecipq_qs8 subtracts.
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1110
// Per-lane reciprocal (1/a) of a Q16 fixed point vector, saturating ops.
// Same algorithm as vrecip_qs16 but using vq* intrinsics so intermediate
// overflow saturates instead of wrapping.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: for 16-bit lanes the bit width is 16 (was 8, which
    // never normalized the operand into [0.5, 1); vqrecipq_qs16 uses 16)
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vqshl_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x.
    // Fixed: this used an addition; the quad-register variant subtracts.
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    // (vqmul_qs16 throughout: the outer multiply used the non-saturating
    //  vmul_qs16, inconsistent with vqrecip_qs8)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1136
// Per-lane reciprocal (1/a) of a Q8 fixed point vector (quad register,
// non-saturating ops). The input is normalized into [0.5, 1) by a left shift,
// the reciprocal of the normalized value is found by Newton-Raphson, and the
// same shift is applied again to undo the normalization.
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1161
// Per-lane reciprocal (1/a) of a Q16 fixed point vector (quad register,
// non-saturating ops). Normalize into [0.5, 1), run Newton-Raphson, undo shift.
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is 23130 = 0x5A5A (was 0x5A56, inconsistent with vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vshlq_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1188
// Per-lane reciprocal (1/a) of a Q8 fixed point vector (quad register,
// saturating ops). Same algorithm as vrecipq_qs8 but intermediate overflow
// saturates instead of wrapping.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1213
// Per-lane reciprocal (1/a) of a Q16 fixed point vector (quad register,
// saturating ops). Lanes where a == 0 return the saturated maximum.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is 23130 = 0x5A5A (was 0x5A56, inconsistent with vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: amount needed to normalize a into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vqshlq_s16(a, shift_value);

    // Newton-Raphson seed for 1/x on [0.5, 1): x0 = 48/17 - 32/17 * x
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{n+1} = x_n + x_n * (1 - temp * x_n)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero returns INT16_MAX)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1241
// Per-lane Q8 fixed point division: a / b == a * (1 / b)
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1246
// Per-lane Q16 fixed point division: a / b == a * (1 / b)
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1251
// Per-lane Q8 fixed point division (quad register): a / b == a * (1 / b)
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1256
// Per-lane Q16 fixed point division (quad register): a / b == a * (1 / b)
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1261
// Degree-4 polynomial approximation (no constant term), Horner evaluation:
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// Coefficients A..D come from exp_tab_qs8 (islog == false) or log_tab_qs8
// (islog == true), stored in Q0.7, rescaled to the runtime Q format by a
// rounding shift of -(7 - fixed_point_position). For the log table the first
// coefficient is shifted one bit less, i.e. doubled relative to the others --
// NOTE(review): presumably because it exceeds the Q0.7 range; confirm against
// the log_tab_qs8 definition.
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1277
// Degree-4 polynomial approximation (no constant term), Horner evaluation:
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// Coefficients come from exp_tab_qs16 / log_tab_qs16 (Q0.15) and are rescaled
// to the runtime Q format with a rounding shift of -(15 - fixed_point_position).
// The log table's first coefficient is shifted one bit less (doubled) --
// NOTE(review): presumably out of Q0.15 range; confirm against log_tab_qs16.
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1293
// Saturating variant of vtaylor_poly_qs8: same degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// but every add/multiply/shift saturates instead of wrapping.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Rescale the Q0.7 table coefficients to the runtime Q format; the log
    // table's first coefficient gets one bit less shift (doubled)
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1309
// Saturating variant of vtaylor_poly_qs16: same degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// but every add/multiply/shift saturates instead of wrapping.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Rescale the Q0.15 table coefficients to the runtime Q format; the log
    // table's first coefficient gets one bit less shift (doubled)
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1325
// Quad-register variant of vtaylor_poly_qs8: degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// with coefficients from exp_tabq_qs8 / log_tabq_qs8 (Q0.7) rescaled by a
// rounding shift of -(7 - fixed_point_position); non-saturating ops.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient gets one bit less shift (doubled)
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1341
1342template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001343inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1344{
1345 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1346 const qint16x8_t const_one = vdupq_n_s16(1);
1347 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1348 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1349 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1350 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1351 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1352 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1353 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1354 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1355 return res;
1356}
1357
// Saturating quad-register variant of the degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// (see vtaylor_polyq_qs8); every add/multiply/shift saturates.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Rescale the Q0.7 table coefficients; the log table's first coefficient
    // gets one bit less shift (doubled)
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1373
// Saturating quad-register variant of the degree-4 Horner polynomial
//   res = A*a + B*a^2 + C*a^3 + D*a^4
// (see vtaylor_polyq_qs16); every add/multiply/shift saturates.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Rescale the Q0.15 table coefficients; the log table's first coefficient
    // gets one bit less shift (doubled)
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner scheme, innermost coefficient first
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1389
// Saturating per-lane exponential e^a of a Q8 fixed point vector.
// Strategy: write a = m*ln(2) + alpha with integer m and |alpha| < ln(2),
// approximate e^alpha with a polynomial, then reconstruct via a shift by m
// (multiplication by 2^m).
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) ~= 0.6931 in Q0.7, rounded-shifted to the runtime format
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 0x38 is the fractional part of 1/ln(2) ~= 1.4427 in Q0.7; OR-ing in
    // const_one sets the integer bit, yielding 1/ln(2) in the runtime format
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1415
// Saturating per-lane exponential e^a of a Q16 fixed point vector.
// Same scheme as vqexp_qs8: a = m*ln(2) + alpha, polynomial for e^alpha,
// reconstruction by a shift of m (multiplication by 2^m).
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) ~= 0.6931 in Q0.15, rounded-shifted to the runtime format
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing in const_one
    // sets the integer bit, yielding 1/ln(2) in the runtime format
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1441
// Saturating per-lane exponential e^a of a Q8 fixed point vector
// (quad register). Same scheme as vqexp_qs8.
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7; 0x38 is the fractional part of 1/ln(2) in Q0.7,
    // with the integer bit OR-ed in below
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1467
// Saturating per-lane exponential e^a of a Q16 fixed point vector
// (quad register). Same scheme as vqexp_qs16.
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15; 0x38AA is the fractional part of 1/ln(2) in
    // Q0.15, with the integer bit OR-ed in below
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1493
// Per-lane natural logarithm ln(a) of a Q8 fixed point vector.
// For 0 < a < 1 the lane is replaced by 1/a and the sign of the result is
// flipped at the end (ln(1/x) == -ln(x)). The value is then split as
// a = 2^n * x with x in (1, 2], ln(x) is approximated by a polynomial and
// ln(a) = (poly + n) * ln(2) is reconstructed.
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip = vdup_n_s8(0);
    // Fixed: select operands were swapped - the lanes with a < 1 must feed 'a'
    // (not the zero placeholder) into the reciprocal. The zero placeholder goes
    // to the other lanes, whose bogus reciprocal is discarded by the select
    // after vrecip_qs8.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a = vbsl_s8(calc_reciprocal, recip, a);

    // Integer part of a (arithmetic shift right by the fractional bit count)
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2], keep x - 1 for the polynomial
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum = vmul_s8(shift_value, const_one); // n in fixed point

    // Polynomial Approximation of ln(x) around x = 1
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: ln(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1532
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001533inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1534{
1535 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1536 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1537 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1538
1539 // If 0 < a < 1, calculate log(1/x)
1540 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1541 qint16x4_t recip = vdup_n_s16(0);
1542 recip = vbsl_s16(calc_reciprocal, recip, a);
1543
1544 // Calculate reciprocal
1545 recip = vrecip_qs16(recip, fixed_point_position);
1546 a = vbsl_s16(calc_reciprocal, recip, a);
1547
1548 // Get decimal part of a
1549 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1550 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1551
1552 // Get exponent of 2^n which is equal or less than dec_a
1553 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1554
1555 // Get x to range (1, 2]
1556 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1557 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1558 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1559
1560 // Polynomial Approximation
1561 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1562
1563 // Reconstruct
1564 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1565
1566 // Set negative value for 0 < a < 1
1567 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1568
1569 return poly;
1570}
1571
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001572inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
1573{
1574 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1575 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
1576 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1577
1578 // If 0 < a < 1, calculate log(1/x)
1579 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
1580 qint8x16_t recip = vdupq_n_s8(0);
1581 recip = vbslq_s8(calc_reciprocal, a, recip);
1582
1583 // Calculate reciprocal
1584 recip = vrecipq_qs8(recip, fixed_point_position);
1585 a = vbslq_s8(calc_reciprocal, recip, a);
1586
1587 // Get decimal part of a
1588 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
1589 qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
1590
1591 // Get exponent of 2^n which is equal or less than dec_a
1592 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
1593
1594 // Get x to range (1, 2]
1595 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
1596 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
1597 const qint8x16_t sum = vmulq_s8(shift_value, const_one);
1598
1599 // Polynomial Approximation
1600 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
1601
1602 // Reconstruct
1603 poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
1604
1605 // Set negative value for 0 < a < 1
1606 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
1607
1608 return poly;
1609}
1610
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001611inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
1612{
1613 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1614 const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
1615 const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1616
1617 // If 0 < a < 1, calculate log(1/x)
1618 uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
1619 qint16x8_t recip = vdupq_n_s16(0);
1620 recip = vbslq_s16(calc_reciprocal, a, recip);
1621
1622 // Calculate reciprocal
1623 recip = vqrecipq_qs16(recip, fixed_point_position);
1624 a = vbslq_s16(calc_reciprocal, recip, a);
1625
1626 // Get decimal part of a
1627 qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
1628 qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position
1629
1630 // Get exponent of 2^n which is equal or less than dec_a
1631 shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
1632
1633 // Get x to range (1, 2]
1634 const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
1635 const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
1636 const qint16x8_t sum = vmulq_s16(shift_value, const_one);
1637
1638 // Polynomial Approximation
1639 qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
1640
1641 // Reconstruct
1642 poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
1643
1644 // Set negative value for 0 < a < 1
1645 poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
1646
1647 return poly;
1648}
1649
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshl_s8(x, shift_value2);
}
1676
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshl_s16(x, shift_value2);
}
1705
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshl_s8(x, shift_value2);
}
1732
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshl_s16(x, shift_value2);
}
1761
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshlq_s8(x, shift_value2);
}
1788
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * The input is scaled by a power of two into the (0.5, 2) range, refined with
 * the Newton-Raphson iteration x <- (x / 2) * (3 - a * x^2), then scaled back
 * by half the shift (a square root halves the exponent).
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vshlq_s16(x, shift_value2);
}
1817
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs8 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize a into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshlq_s8(x, shift_value2);
}
1844
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs16 but using saturating
 * add/sub/negate/shift/multiply intrinsics so intermediate overflow clamps
 * instead of wrapping.
 * NOTE(review): assumes a > 0 — vclz on a non-positive value yields a
 * meaningless shift; confirm against callers.
 *
 * @param[in] a                    Fixed point input vector.
 * @param[in] fixed_point_position Number of bits used for the fractional part.
 *
 * @return The calculated inverse square root.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, negated: undoes the scaling at the end
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalize a into the convergence range
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the input normalization (shift by -shift/2)
    return vqshlq_s16(x, shift_value2);
}
1873
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001874inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001875{
1876 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1877 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1878
1879 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1880 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1881 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001882 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001883
1884 return tanh;
1885}
1886
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001887inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001888{
1889 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1890 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1891
1892 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1893 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1894 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001895 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001896
1897 return tanh;
1898}
1899
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001900inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001901{
1902 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1903 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1904
1905 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1906 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1907 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1908 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1909
1910 return tanh;
1911}
1912
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001913inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1914{
1915 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1916 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1917
1918 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1919 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1920 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1921 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1922
1923 return tanh;
1924}
1925
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001926inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1927{
1928 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1929}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001930
1931inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1932{
1933 float32x4x2_t res =
1934 {
1935 {
1936 vmaxq_f32(a.val[0], b.val[0]),
1937 vmaxq_f32(a.val[1], b.val[1])
1938 }
1939 };
1940 return res;
1941}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001942}