/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
28/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
29 * Format is in Q0.7 for all elements */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010030static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031{
32 {
33 vdup_n_s8(0x7F), // 0.9978546
34 vdup_n_s8(0x3F), // 0.4994721
35 vdup_n_s8(0x16), // 0.1763723
36 vdup_n_s8(0x05), // 0.0435108
37 }
38};
39
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010040/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
41 * Format is in Q0.15 for all elements */
42static const std::array<qint16x4_t, 4> exp_tab_qs16 =
43{
44 {
45 vdup_n_s16(0x7FBA), // 0.9978546
46 vdup_n_s16(0x3FE9), // 0.4994721
47 vdup_n_s16(0x1693), // 0.1763723
48 vdup_n_s16(0x0592), // 0.0435108
49 }
50};
51
Anthony Barbier6ff3b192017-09-04 18:44:23 +010052/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
53 * Format is in Q0.7 for all elements */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010054static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055{
56 {
57 vdupq_n_s8(0x7F), // 0.9978546
58 vdupq_n_s8(0x3F), // 0.4994721
59 vdupq_n_s8(0x16), // 0.1763723
60 vdupq_n_s8(0x05), // 0.0435108
61 }
62};
63
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010064/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
65 * Format is in Q0.15 for all elements */
66static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
67{
68 {
69 vdupq_n_s16(0x7FBA), // 0.9978546
70 vdupq_n_s16(0x3FE9), // 0.4994721
71 vdupq_n_s16(0x1693), // 0.1763723
72 vdupq_n_s16(0x0592), // 0.0435108
73 }
74};
75
Anthony Barbier6ff3b192017-09-04 18:44:23 +010076/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
77 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010078static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010079{
80 {
81 vdup_n_s8(0x5C), // 1.4384189
82 vdup_n_s8(-0x56), // -0.6771900
83 vdup_n_s8(0x29), // 0.3218538
84 vdup_n_s8(-0x0A), // -0.0832229
85 }
86};
87
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010088/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
89 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
90static const std::array<qint16x4_t, 4> log_tab_qs16 =
91{
92 {
93 vdup_n_s16(0x5C0F), // 1.4384189
94 vdup_n_s16(-0x56AE), // -0.6771900
95 vdup_n_s16(0x2933), // 0.3218538
96 vdup_n_s16(-0x0AA7), // -0.0832229
97 }
98};
99
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100100/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
101 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100102static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103{
104 {
105 vdupq_n_s8(0x5C), // 1.4384189
106 vdupq_n_s8(-0x56), // -0.6771900
107 vdupq_n_s8(0x29), // 0.3218538
108 vdupq_n_s8(-0x0A), // -0.0832229
109 }
110};
111
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100112/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
113 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
114static const std::array<qint16x8_t, 4> log_tabq_qs16 =
115{
116 {
117 vdupq_n_s16(0x5C0F), // 1.4384189
118 vdupq_n_s16(-0x56AE), // -0.6771900
119 vdupq_n_s16(0x2933), // 0.3218538
120 vdupq_n_s16(-0x0AA7), // -0.0832229
121 }
122};
123
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100124inline qint8x8_t vget_low_qs8(qint8x16_t a)
125{
126 return vget_low_s8(a);
127}
128
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100129inline qint16x4_t vget_low_qs16(qint16x8_t a)
130{
131 return vget_low_s16(a);
132}
133
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134inline qint8x8_t vget_high_qs8(qint8x16_t a)
135{
136 return vget_high_s8(a);
137}
138
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100139inline qint16x4_t vget_high_qs16(qint16x8_t a)
140{
141 return vget_high_s16(a);
142}
143
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144inline qint8x8_t vld1_qs8(const qint8_t *addr)
145{
146 return vld1_s8(addr);
147}
148
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100149inline qint16x4_t vld1_qs16(const qint16_t *addr)
150{
151 return vld1_s16(addr);
152}
153
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100154inline qint8x16_t vld1q_qs8(const qint8_t *addr)
155{
156 return vld1q_s8(addr);
157}
158
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159inline qint16x8_t vld1q_qs16(const qint16_t *addr)
160{
161 return vld1q_s16(addr);
162}
163
164inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
165{
166 return vld1_dup_s8(addr);
167}
168
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100169inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
170{
171 return vld1_dup_s16(addr);
172}
173
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100174inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
175{
176 return vld1q_dup_s8(addr);
177}
178
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100179inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
180{
181 return vld1q_dup_s16(addr);
182}
183
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100184inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
185{
186 return vld2q_s16(addr);
187}
188
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100189inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
190{
191 vst1_s8(addr, b);
192}
193
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
195{
196 vst1_s16(addr, b);
197}
198
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100199inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
200{
201 vst1q_s8(addr, b);
202}
203
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100204inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
205{
206 vst1q_s16(addr, b);
207}
208
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100209inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
210{
211 vst2q_s16(addr, b);
212}
213
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214inline qint8x8_t vqmovn_qs16(qint16x8_t a)
215{
216 return vqmovn_s16(a);
217}
218
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100219inline qint16x4_t vqmovn_qs32(qint32x4_t a)
220{
221 return vqmovn_s32(a);
222}
223
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224inline qint8x8_t vdup_n_qs8(qint8_t a)
225{
226 return vdup_n_s8(a);
227}
228
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100229inline qint16x4_t vdup_n_qs16(qint16_t a)
230{
231 return vdup_n_s16(a);
232}
233
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234inline qint8x16_t vdupq_n_qs8(qint8_t a)
235{
236 return vdupq_n_s8(a);
237}
238
239inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
240{
241 float32x4x4_t res =
242 {
243 {
244 vdupq_n_f32(a),
245 vdupq_n_f32(a),
246 vdupq_n_f32(a),
247 vdupq_n_f32(a),
248 }
249 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100250 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100251}
252
253inline qint16x8_t vdupq_n_qs16(qint16_t a)
254{
255 return vdupq_n_s16(a);
256}
257
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100258inline qint32x4_t vdupq_n_qs32(qint32_t a)
259{
260 return vdupq_n_s32(a);
261}
262
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100263inline qint8x8_t vabs_qs8(qint8x8_t a)
264{
265 return vabs_s8(a);
266}
267
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100268inline qint16x4_t vabs_qs16(qint16x4_t a)
269{
270 return vabs_s16(a);
271}
272
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100273inline qint8x16_t vabsq_qs8(qint8x16_t a)
274{
275 return vabsq_s8(a);
276}
277
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100278inline qint16x8_t vabsq_qs16(qint16x8_t a)
279{
280 return vabsq_s16(a);
281}
282
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283inline qint8x8_t vqabs_qs8(qint8x8_t a)
284{
285 return vqabs_s8(a);
286}
287
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100288inline qint16x4_t vqabs_qs16(qint16x4_t a)
289{
290 return vqabs_s16(a);
291}
292
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100293inline qint8x16_t vqabsq_qs8(qint8x16_t a)
294{
295 return vqabsq_s8(a);
296}
297
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100298inline qint16x8_t vqabsq_qs16(qint16x8_t a)
299{
300 return vqabsq_s16(a);
301}
302
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
304{
305 return vmax_s8(a, b);
306}
307
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100308inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
309{
310 return vmax_s16(a, b);
311}
312
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100313inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
314{
315 return vmaxq_s8(a, b);
316}
317
318inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
319{
320 return vpmax_s8(a, b);
321}
322
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100323inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
324{
325 return vpmax_s16(a, b);
326}
327
328inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
329{
330 return vmaxq_s16(a, b);
331}
332
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100333inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
334{
335 return vmin_s8(a, b);
336}
337
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100338inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
339{
340 return vmin_s16(a, b);
341}
342
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100343inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
344{
345 return vminq_s8(a, b);
346}
347
348inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
349{
350 return vpmin_s8(a, b);
351}
352
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100353inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
354{
355 return vpmin_s16(a, b);
356}
357
358inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
359{
360 return vminq_s16(a, b);
361}
362
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
364{
365 return vadd_s8(a, b);
366}
367
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100368inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
369{
370 return vadd_s16(a, b);
371}
372
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100373inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
374{
375 return vaddq_s8(a, b);
376}
377
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100378inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
379{
380 return vaddq_s16(a, b);
381}
382
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100383inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
384{
385 return vqadd_s8(a, b);
386}
387
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100388inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
389{
390 return vqadd_s16(a, b);
391}
392
Georgios Pinitas9247c922017-06-28 18:29:47 +0100393inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
394{
395 return vqadd_s32(a, b);
396}
397
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100398inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
399{
400 return vqaddq_s8(a, b);
401}
402
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
404{
405 return vqaddq_s16(a, b);
406}
407
Georgios Pinitas9247c922017-06-28 18:29:47 +0100408inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
409{
410 return vqaddq_s32(a, b);
411}
412
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100413inline int16x4_t vpaddl_qs8(qint8x8_t a)
414{
415 return vpaddl_s8(a);
416}
417
418inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
419{
420 return vsub_s8(a, b);
421}
422
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100423inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
424{
425 return vsub_s16(a, b);
426}
427
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100428inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
429{
430 return vsubq_s8(a, b);
431}
432
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100433inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
434{
435 return vsubq_s16(a, b);
436}
437
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100438inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
439{
440 return vqsub_s8(a, b);
441}
442
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100443inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
444{
445 return vqsub_s16(a, b);
446}
447
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100448inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
449{
450 return vqsubq_s8(a, b);
451}
452
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100453inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
454{
455 return vqsubq_s16(a, b);
456}
457
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
459{
460 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
461
462 // Initialize the temporary result with a constant used to round up the result
463 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
464
465 // Vector multiply-accumulate long
466 res = vmlal_s8(res, a, b);
467
468 // Shift right by fixed_point_position
469 res = vshlq_s16(res, fixed_point_position_s16);
470
471 // Convert back to qint8
472 return vmovn_s16(res);
473}
474
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100475inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
476{
477 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
478
479 // Initialize the temporary result with a constant used to round up the result
480 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
481
482 // Vector multiply-accumulate long
483 res = vmlal_s16(res, a, b);
484
485 // Shift right by fixed_point_position
486 res = vshlq_s32(res, fixed_point_position_s32);
487
488 // Convert back to qint16
489 return vmovn_s32(res);
490}
491
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100492inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
493{
494 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
495
496 // Initialize the temporary results with a constant used to round up the result
497 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
498 qint16x8_t res1 = res0;
499
500 // Vector multiply-accumulate long
501 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
502 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
503
504 // Shift right by fixed_point_position
505 res0 = vshlq_s16(res0, fixed_point_position_s16);
506 res1 = vshlq_s16(res1, fixed_point_position_s16);
507
508 // Convert back to qint8
509 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
510}
511
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100512inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
513{
514 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
515
516 // Initialize the temporary results with a constant used to round up the result
517 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
518 qint32x4_t res1 = res0;
519
520 // Vector multiply-accumulate long
521 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
522 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
523
524 // Shift right by fixed_point_position
525 res0 = vshlq_s32(res0, fixed_point_position_s32);
526 res1 = vshlq_s32(res1, fixed_point_position_s32);
527
528 // Convert back to qint16
529 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
530}
531
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100532inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
533{
534 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
535
536 // Initialize the temporary result with a constant used to round up the result
537 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
538
539 // Vector multiply-accumulate long
540 res = vmlal_s8(res, a, b);
541
542 // Shift right by fixed_point_position
543 res = vqshlq_s16(res, fixed_point_position_s16);
544
545 // Convert back to qint8 and saturate
546 return vqmovn_s16(res);
547}
548
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100549inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
550{
551 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
552
553 // Initialize the temporary result with a constant used to round up the result
554 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
555
556 // Vector multiply-accumulate long
557 res = vmlal_s16(res, a, b);
558
559 // Shift right by fixed_point_position
560 res = vqshlq_s32(res, fixed_point_position_s32);
561
562 // Convert back to qint16 and saturate
563 return vqmovn_s32(res);
564}
565
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100566inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
567{
568 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
569
570 // Initialize the temporary results with a constant used to round up the result
571 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
572 qint16x8_t res1 = res0;
573
574 // Vector multiply-accumulate long
575 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
576 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
577
578 // Shift right by fixed_point_position
579 res0 = vqshlq_s16(res0, fixed_point_position_s16);
580 res1 = vqshlq_s16(res1, fixed_point_position_s16);
581
582 // Convert back to qint8 and saturate
583 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
584}
585
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100586inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
587{
588 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
589
590 // Initialize the temporary results with a constant used to round up the result
591 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
592 qint32x4_t res1 = res0;
593
594 // Vector multiply-accumulate long
595 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
596 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
597
598 // Shift right by fixed_point_position
599 res0 = vqshlq_s32(res0, fixed_point_position_s32);
600 res1 = vqshlq_s32(res1, fixed_point_position_s32);
601
602 // Convert back to qint16 and saturate
603 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
604}
605
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100606inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
607{
608 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
609
610 qint16x8_t res = vmull_s8(a, b);
611
612 return vqrshlq_s16(res, fixed_point_position_s16);
613}
614
615inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
616{
617 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
618
619 // Initialize the temporary results with a constant used to round up the result
620 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
621
622 // Vector multiply-accumulate long
623 tmp = vmlal_s8(tmp, b, c);
624
625 // Shift right by fixed_point_position
626 tmp = vshlq_s16(tmp, fixed_point_position_s16);
627
628 // Convert back to qint8 and accumulate
629 return vadd_s8(a, vmovn_s16(tmp));
630}
631
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100632inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
633{
634 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
635
636 // Initialize the temporary results with a constant used to round up the result
637 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
638
639 // Vector multiply-accumulate long
640 tmp = vmlal_s16(tmp, b, c);
641
642 // Shift right by fixed_point_position
643 tmp = vshlq_s32(tmp, fixed_point_position_s32);
644
645 // Convert back to qint16 and accumulate
646 return vadd_s16(a, vmovn_s32(tmp));
647}
648
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100649inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
650{
651 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
652
653 // Initialize the temporary results with a constant used to round up the result
654 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
655 qint16x8_t tmp1 = tmp0;
656
657 // Vector multiply-accumulate long
658 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
659 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
660
661 // Shift right by fixed_point_position
662 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
663 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
664
665 // Convert back to qint8 and accumulate
666 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
667}
668
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100669inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
670{
671 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
672
673 // Initialize the temporary results with a constant used to round up the result
674 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
675 qint32x4_t tmp1 = tmp0;
676
677 // Vector multiply-accumulate long
678 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
679 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
680
681 // Shift right by fixed_point_position
682 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
683 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
684
685 // Convert back to qint16 and accumulate
686 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
687}
688
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100689inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
690{
691 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
692
693 // Initialize the temporary results with a constant used to round up the result
694 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
695
696 // Vector multiply-accumulate long
697 tmp = vmlal_s8(tmp, b, c);
698
699 // Shift right by fixed_point_position
700 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
701
702 // Convert back to qint8 and accumulate
703 return vqadd_s8(a, vqmovn_s16(tmp));
704}
705
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100706inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
707{
708 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
709
710 // Initialize the temporary results with a constant used to round up the result
711 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
712
713 // Vector multiply-accumulate long
714 tmp = vmlal_s16(tmp, b, c);
715
716 // Shift right by fixed_point_position
717 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
718
719 // Convert back to qint8 and accumulate
720 return vqadd_s16(a, vqmovn_s32(tmp));
721}
722
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
724{
725 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
726
727 // Initialize the temporary results with a constant used to round up the result
728 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
729 qint16x8_t tmp1 = tmp0;
730
731 // Vector multiply-accumulate long
732 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
733 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
734
735 // Shift right by fixed_point_position
736 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
737 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
738
739 // Convert back to qint8 and accumulate
740 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
741 return vqaddq_s8(a, res);
742}
743
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100744inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
745{
746 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
747
748 // Initialize the temporary results with a constant used to round up the result
749 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
750 qint32x4_t tmp1 = tmp0;
751
752 // Vector multiply-accumulate long
753 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
754 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
755
756 // Shift right by fixed_point_position
757 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
758 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
759
760 // Convert back to qint16 and accumulate
761 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
762 return vqaddq_s16(a, res);
763}
764
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100765inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
766{
767 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
768
769 // Initialize the temporary results with a constant used to round up the result
770 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
771
772 // Vector multiply-accumulate long
773 tmp = vmlal_s8(tmp, b, c);
774
775 // Shift right by fixed_point_position
776 tmp = vshlq_s16(tmp, fixed_point_position_s16);
777
778 // Accumulate
779 return vaddq_s16(a, tmp);
780}
781
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100782inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
783{
784 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
785
786 // Initialize the temporary results with a constant used to round up the result
787 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
788
789 // Vector multiply-accumulate long
790 tmp = vmlal_s16(tmp, b, c);
791
792 // Shift right by fixed_point_position
793 tmp = vshlq_s32(tmp, fixed_point_position_s32);
794
795 // Accumulate
796 return vaddq_s32(a, tmp);
797}
798
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100799inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
800{
801 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
802
803 // Initialize the temporary results with a constant used to round up the result
804 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
805
806 // Vector multiply-accumulate long
807 tmp = vmlal_s8(tmp, b, c);
808
809 // Shift right by fixed_point_position
810 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
811
812 // Accumulate
813 return vqaddq_s16(a, tmp);
814}
815
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100816inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
817{
818 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
819
820 // Initialize the temporary results with a constant used to round up the result
821 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
822
823 // Vector multiply-accumulate long
824 tmp = vmlal_s16(tmp, b, c);
825
826 // Shift right by fixed_point_position
827 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
828
829 // Accumulate
830 return vqaddq_s32(a, tmp);
831}
832
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100833inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100834{
835 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
836
837 float32x4x2_t res_f32 =
838 {
839 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100840 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
841 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100842 }
843 };
844
845 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
846 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
847
848 const int32x4x2_t res_s32 =
849 {
850 {
851 vcvtq_s32_f32(res_f32.val[0]),
852 vcvtq_s32_f32(res_f32.val[1]),
853 }
854 };
855
856 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
857
858 return vqmovn_s16(res_s16);
859}
860
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100861inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100862{
863 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
864
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100865 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100866
867 res_f32 = vmlaq_f32(res_f32, a, pow2);
868
869 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
870
871 return vqmovn_s32(res_s32);
872}
873
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100874inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100875{
876 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
877
878 float32x4x4_t res_f32 =
879 {
880 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100881 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
882 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
883 vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
884 vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100885 }
886 };
887
888 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
889 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
890 res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
891 res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
892
893 const int32x4x4_t res_s32 =
894 {
895 {
896 vcvtq_s32_f32(res_f32.val[0]),
897 vcvtq_s32_f32(res_f32.val[1]),
898 vcvtq_s32_f32(res_f32.val[2]),
899 vcvtq_s32_f32(res_f32.val[3]),
900 }
901 };
902
903 const int16x8x2_t res_s16 =
904 {
905 {
906 vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
907 vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
908 }
909 };
910
911 return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
912}
913
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100914inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100915{
916 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
917
918 float32x4x2_t res_f32 =
919 {
920 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100921 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
922 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100923 }
924 };
925
926 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
927 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
928
929 const int32x4x2_t res_s32 =
930 {
931 {
932 vcvtq_s32_f32(res_f32.val[0]),
933 vcvtq_s32_f32(res_f32.val[1])
934 }
935 };
936
937 return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
938}
939
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100940inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
941{
942 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
943
944 const int16x8_t res_s16 = vmovl_s8(a);
945
946 const int32x4x2_t res_s32 =
947 {
948 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100949 vmovl_s16(vget_low_qs16(res_s16)),
950 vmovl_s16(vget_high_qs16(res_s16))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100951 }
952 };
953
954 float32x4x2_t res_f32 =
955 {
956 {
957 vcvtq_f32_s32(res_s32.val[0]),
958 vcvtq_f32_s32(res_s32.val[1])
959 }
960 };
961
962 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
963 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
964
965 return res_f32;
966}
967
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100968inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
969{
970 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
971 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
972
973 return vmulq_f32(res_f32, pow2);
974}
975
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100976inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
977{
978 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
979
980 const int16x8x2_t res_s16 =
981 {
982 {
983 vmovl_s8(vget_low_s8(a)),
984 vmovl_s8(vget_high_s8(a)),
985 }
986 };
987
988 const int32x4x4_t res_s32 =
989 {
990 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100991 vmovl_s16(vget_low_qs16(res_s16.val[0])),
992 vmovl_s16(vget_high_qs16(res_s16.val[0])),
993 vmovl_s16(vget_low_qs16(res_s16.val[1])),
994 vmovl_s16(vget_high_qs16(res_s16.val[1])),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100995 }
996 };
997
998 float32x4x4_t res_f32 =
999 {
1000 {
1001 vcvtq_f32_s32(res_s32.val[0]),
1002 vcvtq_f32_s32(res_s32.val[1]),
1003 vcvtq_f32_s32(res_s32.val[2]),
1004 vcvtq_f32_s32(res_s32.val[3])
1005 }
1006 };
1007
1008 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1009 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1010 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
1011 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
1012
1013 return res_f32;
1014}
1015
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001016inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
1017{
1018 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1019
1020 const int32x4x2_t res_s32 =
1021 {
1022 {
1023 vmovl_s16(vget_low_qs16(a)),
1024 vmovl_s16(vget_high_qs16(a))
1025 }
1026 };
1027
1028 float32x4x2_t res_f32 =
1029 {
1030 {
1031 vcvtq_f32_s32(res_s32.val[0]),
1032 vcvtq_f32_s32(res_s32.val[1])
1033 }
1034 };
1035
1036 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1037 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1038
1039 return res_f32;
1040}
1041
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * The input is first normalized into [0.5, 1) by a per-lane shift, a linear
 * initial estimate x0 = 48/17 - 32/17 * a is refined with three iterations of
 * x(n+1) = x(n) + x(n) * (1 - a * x(n)), and the normalization shift is then
 * applied in the opposite direction. Non-saturating arithmetic throughout.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1): clz(a) + fixed_point_position - 8
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vshl_s8(x, shift_value);
}
1066
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001067inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1068{
1069 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1070 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1071 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1072 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1073
1074 // Find shift value
1075 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1076 const qint16x4_t temp = vshl_s16(a, shift_value);
1077
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001078 // Newton-Raphson division initial estimate X0 calculation
1079 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001080
1081 uint16x4_t set_one = vcgt_s16(x, const_one);
1082 x = vbsl_s16(set_one, const_one, x);
1083
1084 // Use five iterations of Newton-Raphson method to get the result
1085 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1086 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1087 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1088 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1089 x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1090
1091 return vshl_s16(x, shift_value);
1092}
1093
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * Saturating counterpart of vrecip_qs8: the shifts, adds, subs and multiplies
 * all saturate instead of wrapping. Same algorithm: normalize into [0.5, 1),
 * refine x0 = 48/17 - 32/17 * a with three Newton-Raphson iterations, then
 * re-apply the normalization shift.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vqshl_s8(x, shift_value);
}
1118
1119inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1120{
1121 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1122 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1123 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1124 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1125
1126 // Find shift value
1127 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1128 const qint16x4_t temp = vqshl_s16(a, shift_value);
1129
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001130 // Newton-Raphson division initial estimate X0 calculation
1131 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001132
1133 uint16x4_t set_one = vcgt_s16(x, const_one);
1134 x = vbsl_s16(set_one, const_one, x);
1135
1136 // Use five iterations of Newton-Raphson method to get the result
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001137 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1138 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1139 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1140 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
1141 x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001142
1143 return vqshl_s16(x, shift_value);
1144}
1145
/** Calculate the reciprocal (1/a) of a 16 x 8 bit fixed point vector using Newton-Raphson.
 *
 * Q-register counterpart of vrecip_qs8: normalize into [0.5, 1), refine the
 * linear initial estimate x0 = 48/17 - 32/17 * a with three iterations of
 * x(n+1) = x(n) + x(n) * (1 - a * x(n)), then re-apply the normalization
 * shift. Non-saturating arithmetic throughout.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vshlq_s8(x, shift_value);
}
1171
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001172inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1173{
1174 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1175 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1176 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1177 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1178
1179 // Find shift value
1180 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1181 const qint16x8_t temp = vshlq_s16(a, shift_value);
1182
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001183 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001184 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1185
1186 // Set initial guess to one if x > 1
1187 uint16x8_t set_one = vcgtq_s16(x, const_one);
1188 x = vbslq_s16(set_one, const_one, x);
1189
1190 // Use five iterations of Newton-Raphson method to get the result
1191 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1192 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1193 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1194 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1195 x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1196
1197 return vshlq_s16(x, shift_value);
1198}
1199
/** Calculate the reciprocal (1/a) of a 16 x 8 bit fixed point vector using Newton-Raphson.
 *
 * Saturating Q-register counterpart of vrecip_qs8: the shifts, adds, subs and
 * multiplies all saturate instead of wrapping. Same algorithm: normalize into
 * [0.5, 1), refine x0 = 48/17 - 32/17 * a with three Newton-Raphson
 * iterations, then re-apply the normalization shift.
 *
 * @param a                    Input vector (assumes a != 0 — TODO confirm caller contract)
 * @param fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same fixed point format.
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Re-apply the normalization shift (1/a needs the opposite shift to a)
    return vqshlq_s8(x, shift_value);
}
1225
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001226inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1227{
1228 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1229 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1230 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1231 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1232
1233 // Find shift value
1234 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1235 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1236
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001237 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001238 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1239
1240 // Set initial guess to one if x > 1
1241 uint16x8_t set_one = vcgtq_s16(x, const_one);
1242 x = vbslq_s16(set_one, const_one, x);
1243
1244 // Use five iterations of Newton-Raphson method to get the result
1245 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1246 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1247 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1248 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1249 x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
1250
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001251 // Saturate result in case of overflow
1252 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001253}
1254
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001255inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1256{
1257 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1258}
1259
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001260inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1261{
1262 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1263}
1264
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001265inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1266{
1267 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1268}
1269
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001270inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1271{
1272 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1273}
1274
/** Evaluate a degree-4 polynomial with no constant term via Horner's scheme:
 *  res = A*a + B*a^2 + C*a^3 + D*a^4 = a * (A + a * (B + a * (C + a * D)))
 *
 * Coefficients come from log_tab_qs8 when @p islog is true, otherwise from
 * exp_tab_qs8. They are stored in Q0.7 and rescaled to the working Q format
 * with a rounding shift; for the log table the leading coefficient A is
 * shifted one bit less, i.e. doubled relative to its table value.
 * Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x8_t x1  = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1290
/** Evaluate a degree-4 polynomial with no constant term via Horner's scheme:
 *  res = A*a + B*a^2 + C*a^3 + D*a^4 = a * (A + a * (B + a * (C + a * D)))
 *
 * Coefficients come from log_tab_qs16 when @p islog is true, otherwise from
 * exp_tab_qs16. They are stored in Q0.15 and rescaled to the working Q format
 * with a rounding shift; for the log table the leading coefficient A is
 * shifted one bit less, i.e. doubled relative to its table value.
 * Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x4_t x1  = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1306
/** Saturating variant of vtaylor_poly_qs8: same Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) with coefficients from
 *  log_tab_qs8 (islog) or exp_tab_qs8 rescaled from Q0.7 to the working
 *  format, but every shift, add and multiply saturates instead of wrapping.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1322
/** Saturating variant of vtaylor_poly_qs16: same Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) with coefficients from
 *  log_tab_qs16 (islog) or exp_tab_qs16 rescaled from Q0.15 to the working
 *  format, but every shift, add and multiply saturates instead of wrapping.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1338
/** Q-register variant of vtaylor_poly_qs8: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 16 lanes with coefficients
 *  from log_tabq_qs8 (islog) or exp_tabq_qs8 rescaled from Q0.7 to the
 *  working format. Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x16_t x1  = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1354
/** Q-register variant of vtaylor_poly_qs16: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 8 lanes with coefficients
 *  from log_tabq_qs16 (islog) or exp_tabq_qs16 rescaled from Q0.15 to the
 *  working format. Non-saturating arithmetic.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x8_t x1  = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1370
/** Saturating Q-register variant of vtaylor_poly_qs8: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 16 lanes with coefficients
 *  from log_tabq_qs8 (islog) or exp_tabq_qs8 rescaled from Q0.7 to the
 *  working format; every shift, add and multiply saturates.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation, innermost term first
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1386
/** Saturating Q-register variant of vtaylor_poly_qs16: Horner evaluation
 *  res = a * (A + a * (B + a * (C + a * D))) over 8 lanes with coefficients
 *  from log_tabq_qs16 (islog) or exp_tabq_qs16 rescaled from Q0.15 to the
 *  working format; every shift, add and multiply saturates.
 *
 * @param a                    Input vector in the working fixed point format
 * @param fixed_point_position Number of fractional bits
 *
 * @return The polynomial evaluated at @p a.
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to Qx.(fixed_point_position);
    // the log table's leading coefficient A gets one bit less shift (doubled)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation, innermost term first
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1402
/** Saturating exponential e^a of an 8 bit fixed point vector.
 *
 * Range-reduces by powers of two: with m = a / ln(2) it evaluates
 * e^a = 2^floor(m) * e^(a - floor(m)*ln(2)), approximating the residual
 * exponential with the degree-4 polynomial of vqtaylor_poly_qs8 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7, rescaled to the working format
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    // 0x38 is the fractional part of 1/ln(2) in Q0.7; OR-ing const_one adds the integer part 1
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1428
/** Saturating exponential e^a of a 16 bit fixed point vector.
 *
 * Range-reduces by powers of two: with m = a / ln(2) it evaluates
 * e^a = 2^floor(m) * e^(a - floor(m)*ln(2)), approximating the residual
 * exponential with the degree-4 polynomial of vqtaylor_poly_qs16 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15, rescaled to the working format
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing const_one adds the integer part 1
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1454
/** Saturating exponential e^a of a 16 x 8 bit fixed point vector.
 *
 * Q-register counterpart of vqexp_qs8: range-reduces by powers of two and
 * approximates the residual exponential with the degree-4 polynomial of
 * vqtaylor_polyq_qs8 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // 0x58 is ln(2) in Q0.7, rescaled to the working format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x38 is the fractional part of 1/ln(2) in Q0.7; OR-ing const_one adds the integer part 1
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1480
/** Saturating exponential e^a of an 8 x 16 bit fixed point vector.
 *
 * Q-register counterpart of vqexp_qs16: range-reduces by powers of two and
 * approximates the residual exponential with the degree-4 polynomial of
 * vqtaylor_polyq_qs16 plus 1.
 *
 * @param a                    Input vector
 * @param fixed_point_position Number of fractional bits
 *
 * @return e^a in the same fixed point format (saturated on overflow).
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // 0x58B9 is ln(2) in Q0.15, rescaled to the working format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x38AA is the fractional part of 1/ln(2) in Q0.15; OR-ing const_one adds the integer part 1
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Extract the integer part of m (the power-of-two exponent)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1506
/** Natural logarithm of 8 x 8-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * Fix: the vbsl_s8 operands were swapped (`vbsl_s8(calc_reciprocal, recip, a)`),
 * which fed zero into vrecip_qs8 for exactly the lanes that needed 1/a.
 * vbsl selects its SECOND argument where the mask bits are set, so the value
 * to invert must come first — matching the correct order used in vlogq_qs8.
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Keep the lanes that need the reciprocal (a < 1), zero elsewhere
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1545
/** Natural logarithm of 4 x 16-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * Fix: the vbsl_s16 operands were swapped (`vbsl_s16(calc_reciprocal, recip, a)`),
 * which fed zero into vrecip_qs16 for exactly the lanes that needed 1/a.
 * vbsl selects its SECOND argument where the mask bits are set, so the value
 * to invert must come first — matching the correct order used in vlogq_qs8/vlogq_qs16.
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Keep the lanes that need the reciprocal (a < 1), zero elsewhere
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1584
/** Natural logarithm of 16 x 8-bit fixed-point values.
 *
 * Lanes with 0 < a < 1 are replaced by their reciprocal first
 * (log(a) = -log(1/a)), so the core approximation only sees arguments >= 1.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one   = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t seven = vdupq_n_s8(7);
    const qint8x16_t ln2   = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // Keep only the lanes that need a reciprocal (a < 1), zero elsewhere
    uint8x16_t use_recip = vcltq_s8(a, one);
    qint8x16_t recip     = vdupq_n_s8(0);
    recip                = vbslq_s8(use_recip, a, recip);

    // Compute 1/a for those lanes and merge back
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(use_recip, recip, a);

    // Integer part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Largest n with 2^n <= dec_a
    shift_value = vsubq_s8(seven, vclzq_s8(dec_a));

    // Normalize a into (1, 2] and split off the n * 1.0 contribution
    const qint8x16_t frac = vsubq_s8(vrshlq_s8(a, vnegq_s8(shift_value)), one);
    const qint8x16_t bias = vmulq_s8(shift_value, one);

    // Polynomial approximation of log on (1, 2]
    qint8x16_t poly = vtaylor_polyq_qs8<true>(frac, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, bias), ln2, fixed_point_position);

    // Negate the lanes where the reciprocal was taken
    return vbslq_s8(use_recip, vnegq_s8(poly), poly);
}
1623
/** Natural logarithm of 8 x 16-bit fixed-point values.
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used, so the core
 * approximation only ever sees arguments >= 1.
 *
 * NOTE(review): this variant mixes saturating ops (vqrecipq_qs16, vqsubq_s16,
 * vqmulq_qs16) with plain ones (vmulq_s16), unlike vlogq_qs8 which is fully
 * non-saturating — verify this asymmetry is intentional.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) in Q0.15, rescaled to the requested format

    // If 0 < a < 1, calculate log(1/x): keep the lanes needing a reciprocal (a < 1), zero elsewhere
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating) and merge it back into those lanes
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2] and split off the n * 1.0 contribution
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation of log on (1, 2]
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1662
/** Inverse square root (1/sqrt(a)) of 8 x 8-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint8x8_t bias        = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t is_negative = vclt_s8(bias, vdup_n_qs8(0));
    bias                  = vbsl_s8(is_negative, vadd_s8(bias, vdup_n_s8(1)), bias);
    const qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(bias, 1));

    // Scaled argument and initial guess
    const qint8x8_t scaled = vshl_s8(a, shift_value);
    qint8x8_t       x      = scaled;

    // Three Newton-Raphson rounds suffice for 8-bit precision
    for(int iter = 0; iter < 3; ++iter)
    {
        x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(scaled, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshl_s8(x, shift_value2);
}
1689
/** Inverse square root (1/sqrt(a)) of 4 x 16-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint16x4_t bias        = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t is_negative = vclt_s16(bias, vdup_n_qs16(0));
    bias                   = vbsl_s16(is_negative, vadd_s16(bias, vdup_n_s16(1)), bias);
    const qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(bias, 1));

    // Scaled argument and initial guess
    const qint16x4_t scaled = vshl_s16(a, shift_value);
    qint16x4_t       x      = scaled;

    // Five Newton-Raphson rounds suffice for 16-bit precision
    for(int iter = 0; iter < 5; ++iter)
    {
        x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(scaled, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshl_s16(x, shift_value2);
}
1718
/** Saturating inverse square root (1/sqrt(a)) of 8 x 8-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrt_qs8, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshl_s8(x, shift_value2);
}
1745
/** Saturating inverse square root (1/sqrt(a)) of 4 x 16-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrt_qs16, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshl_s16(x, shift_value2);
}
1774
/** Inverse square root (1/sqrt(a)) of 16 x 8-bit fixed-point values,
 * computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint8x16_t bias        = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t is_negative = vcltq_s8(bias, vdupq_n_qs8(0));
    bias                   = vbslq_s8(is_negative, vaddq_s8(bias, vdupq_n_s8(1)), bias);
    const qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(bias, 1));

    // Scaled argument and initial guess
    const qint8x16_t scaled = vshlq_s8(a, shift_value);
    qint8x16_t       x      = scaled;

    // Three Newton-Raphson rounds suffice for 8-bit precision
    for(int iter = 0; iter < 3; ++iter)
    {
        x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(scaled, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vshlq_s8(x, shift_value2);
}
1801
/** Inverse square root (1/sqrt(a)) of 8 x 16-bit fixed-point values via
 * Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2) (non-saturating variant).
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring the argument into the convergence range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vshlq_s16(x, shift_value2);
}
1830
/** Saturating inverse square root (1/sqrt(a)) of 16 x 8-bit fixed-point values.
 * Same Newton-Raphson scheme as vinvsqrtq_qs8, but using saturating arithmetic
 * so intermediate values clamp instead of wrapping.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Result compensation: sqrt halves the exponent, hence the >> 1
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Bring the argument into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the range-reduction scaling
    return vqshlq_s8(x, shift_value2);
}
1857
/** Saturating inverse square root (1/sqrt(a)) of 8 x 16-bit fixed-point
 * values, computed with Newton-Raphson iteration x' = (x / 2) * (3 - a * x^2).
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Shift that brings the argument into the (0.5, 2) convergence range
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Compensation shift applied to the result; negative values are biased by
    // one so that the halving (>> 1) below rounds the right way
    qint16x8_t bias        = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t is_negative = vcltq_s16(bias, vdupq_n_qs16(0));
    bias                   = vbslq_s16(is_negative, vqaddq_s16(bias, vdupq_n_s16(1)), bias);
    const qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(bias, 1));

    // Scaled argument and initial guess
    const qint16x8_t scaled = vqshlq_s16(a, shift_value);
    qint16x8_t       x      = scaled;

    // Five Newton-Raphson rounds suffice for 16-bit precision
    for(int iter = 0; iter < 5; ++iter)
    {
        x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(scaled, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    }

    // Undo the range-reduction scaling
    return vqshlq_s16(x, shift_value2);
}
1886
/** Saturating hyperbolic tangent of 8 x 8-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t two = vdup_n_s8(2 << fixed_point_position);

    const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x8_t num   = vqsub_qs8(exp2x, one);
    const qint8x8_t den   = vqadd_qs8(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
}
1899
/** Saturating hyperbolic tangent of 4 x 16-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t two = vdup_n_s16(2 << fixed_point_position);

    const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x4_t num   = vqsub_qs16(exp2x, one);
    const qint16x4_t den   = vqadd_qs16(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
}
1912
/** Saturating hyperbolic tangent of 16 x 8-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position);

    const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t num   = vqsubq_qs8(exp2x, one);
    const qint8x16_t den   = vqaddq_qs8(exp2x, one);

    // Division implemented as multiplication by the reciprocal
    return vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
}
1925
/** Saturating hyperbolic tangent of 8 x 16-bit fixed-point values,
 * using tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
 */
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);

    // exp(2x), computed with the saturating fixed-point exponential
    const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t num = vqsubq_qs16(exp2x, const_one); // exp(2x) - 1
    const qint16x8_t den = vqaddq_qs16(exp2x, const_one); // exp(2x) + 1
    // Division implemented as multiplication by the reciprocal
    const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1938
/** Saturating power of 16 x 8-bit fixed-point values: a^b = exp(b * log(a)). */
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t log_a    = vlogq_qs8(a, fixed_point_position);
    const qint8x16_t exponent = vqmulq_qs8(b, log_a, fixed_point_position);
    return vqexpq_qs8(exponent, fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001943
/** Lane-wise maximum of two float32x4x2_t values (both halves compared independently). */
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
1954}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001955}