blob: 86b789dc56bb4b2fc95a889f61d90177b952437b [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements
 * The four entries are the polynomial coefficients (fixed point encodings of the
 * float values in the trailing comments); each entry is broadcast to every lane.
 */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements
 * Same coefficients as exp_tab_qs8 at 16 bit precision, broadcast per lane.
 */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements
 * Quad-register (16 lane) variant of exp_tab_qs8.
 */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements
 * Quad-register (8 lane) variant of exp_tab_qs16.
 */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
79
/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 * The four entries are the polynomial coefficients (fixed point encodings of the
 * float values in the trailing comments); each entry is broadcast to every lane.
 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 * Same coefficients as log_tab_qs8 at 16 bit precision, broadcast per lane.
 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 * Quad-register (16 lane) variant of log_tab_qs8.
 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 * Quad-register (8 lane) variant of log_tab_qs16.
 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
131
/** Get the lower half (8 elements) of a 16-element QS8 vector */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (4 elements) of an 8-element QS16 vector */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (8 elements) of a 16-element QS8 vector */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (4 elements) of an 8-element QS16 vector */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
151
/** Load 8 QS8 elements from @p addr */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it to all 8 lanes */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it to all 4 lanes */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and duplicate it to all 16 lanes */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and duplicate it to all 8 lanes */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load 16 interleaved QS16 elements from @p addr, de-interleaving into two vectors */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
196
/** Store 8 QS8 elements to @p addr */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two QS16 vectors interleaved (2-element structures) to @p addr */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
221
/** Narrow 8 QS16 elements to QS8 with saturation */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Narrow 4 QS32 elements to QS16 with saturation */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
231
/** Duplicate a QS8 scalar to all 8 lanes */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar to all 4 lanes */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar to all 16 lanes */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
246
/** Duplicate a float to all 16 lanes and convert to QS8 with saturation.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits used in the conversion
 *
 * @return 16 QS8 lanes all holding the converted value
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}
260
/** Duplicate a float to all 8 lanes and convert to QS16 with saturation.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits used in the conversion
 *
 * @return 8 QS16 lanes all holding the converted value
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    float32x4x2_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs16_f32(res, fixed_point_position);
}
272
/** Duplicate a QS16 scalar to all 8 lanes */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar to all 4 lanes */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
282
/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps) */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating) */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating) */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating) */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
322
/** Element-wise maximum of two QS8 vectors (8 elements) */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two QS16 vectors (4 elements) */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two QS8 vectors (16 elements) */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent QS8 elements across @p a and @p b */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent QS16 elements across @p a and @p b */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two QS16 vectors (8 elements) */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two QS8 vectors (8 elements) */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two QS16 vectors (4 elements) */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two QS8 vectors (16 elements) */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent QS8 elements across @p a and @p b */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent QS16 elements across @p a and @p b */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two QS16 vectors (8 elements) */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
382
/** Add two QS8 vectors, 8 elements (wrapping on overflow) */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two QS16 vectors, 4 elements (wrapping on overflow) */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two QS8 vectors, 16 elements (wrapping on overflow) */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two QS16 vectors, 8 elements (wrapping on overflow) */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two QS8 vectors, 8 elements */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two QS16 vectors, 4 elements */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two QS32 vectors, 2 elements */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating add of two QS8 vectors, 16 elements */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two QS16 vectors, 8 elements */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating add of two QS32 vectors, 4 elements */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add of adjacent QS8 elements, widening to 16 bit */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
437
/** Subtract two QS8 vectors, 8 elements (wrapping on overflow) */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two QS16 vectors, 4 elements (wrapping on overflow) */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two QS8 vectors, 16 elements (wrapping on overflow) */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two QS16 vectors, 8 elements (wrapping on overflow) */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two QS8 vectors, 8 elements */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two QS16 vectors, 4 elements */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two QS8 vectors, 16 elements */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two QS16 vectors, 8 elements */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
477
/** Fixed-point multiply of two QS8 vectors (8 elements), non-saturating.
 *
 * Widens to 16 bit, adds the rounding constant 2^(fixed_point_position - 1),
 * then shifts right by @p fixed_point_position and narrows back to 8 bit
 * (truncating on overflow — see vqmul_qs8 for the saturating variant).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiply of two QS16 vectors (4 elements), non-saturating.
 *
 * Widens to 32 bit, adds the rounding constant 2^(fixed_point_position - 1),
 * then shifts right by @p fixed_point_position and narrows back to 16 bit
 * (truncating on overflow — see vqmul_qs16 for the saturating variant).
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
511
/** Fixed-point multiply of two QS8 vectors (16 elements), non-saturating.
 *
 * Processes the low and high halves independently: widen to 16 bit, add the
 * rounding constant 2^(fixed_point_position - 1), shift right, narrow back
 * (truncating on overflow — see vqmulq_qs8 for the saturating variant).
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiply of two QS16 vectors (8 elements), non-saturating.
 *
 * Processes the low and high halves independently: widen to 32 bit, add the
 * rounding constant 2^(fixed_point_position - 1), shift right, narrow back
 * (truncating on overflow — see vqmulq_qs16 for the saturating variant).
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
551
/** Saturating fixed-point multiply of two QS8 vectors (8 elements).
 *
 * Same rounding scheme as vmul_qs8, but the shift (vqshlq) and the final
 * narrowing (vqmovn) both saturate instead of truncating.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiply of two QS16 vectors (4 elements).
 *
 * Same rounding scheme as vmul_qs16, but the shift (vqshlq) and the final
 * narrowing (vqmovn) both saturate instead of truncating.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
585
/** Saturating fixed-point multiply of two QS8 vectors (16 elements).
 *
 * Quad-register variant of vqmul_qs8: low and high halves are widened,
 * rounded, shifted (saturating) and narrowed (saturating) independently.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiply of two QS16 vectors (8 elements).
 *
 * Quad-register variant of vqmul_qs16: low and high halves are widened,
 * rounded, shifted (saturating) and narrowed (saturating) independently.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
625
/** Long fixed-point multiply of two QS8 vectors, keeping the QS16 result.
 *
 * The widened product is scaled back with vqrshlq_s16 (saturating *rounding*
 * shift right), so no explicit rounding constant is needed here.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vqrshlq with a negative shift count performs the rounding right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
634
/** Long fixed-point multiply of two QS16 vectors, keeping the QS32 result.
 *
 * The 32 bit product is rounded by accumulating 2^(fixed_point_position - 1)
 * before the saturating right shift by @p fixed_point_position — matching the
 * rounding used by vmul_qs16 and the rounding shift used by vmull_qs8.
 *
 * @param[in] a                    First operand (4 QS16 elements)
 * @param[in] b                    Second operand (4 QS16 elements)
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return The rounded products as 4 QS32 elements
 */
inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long. Accumulating into the rounding constant
    // (rather than overwriting it with a plain vmull_s16) is the whole point:
    // otherwise the constant is a dead store and the result is truncated
    // instead of rounded.
    tmp = vmlal_s16(tmp, a, b);

    // Shift right by fixed_point_position
    return vqshlq_s32(tmp, fixed_point_position_s32);
}
648
/** Fixed-point multiply-accumulate: a + (b * c), QS8, 8 elements, non-saturating.
 *
 * The product b * c is widened, rounded (constant 2^(fixed_point_position - 1)),
 * shifted right, narrowed, then added to @p a with wrapping arithmetic.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + (b * c), QS16, 4 elements, non-saturating.
 *
 * Same scheme as vmla_qs8 at 16/32 bit width.
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
682
/** Fixed-point multiply-accumulate: a + (b * c), QS8, 16 elements, non-saturating.
 *
 * Quad-register variant of vmla_qs8; low and high halves processed separately.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + (b * c), QS16, 8 elements, non-saturating.
 *
 * Quad-register variant of vmla_qs16; low and high halves processed separately.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
722
/** Saturating fixed-point multiply-accumulate: a + (b * c), QS8, 8 elements.
 *
 * Like vmla_qs8 but every step — shift, narrowing and final add — saturates.
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + (b * c), QS16, 4 elements.
 *
 * Like vmla_qs16 but every step — shift, narrowing and final add — saturates.
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
756
/** Saturating fixed-point multiply-accumulate: a + (b * c), QS8, 16 elements.
 *
 * Quad-register variant of vqmla_qs8; low and high halves processed separately.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + (b * c), QS16, 8 elements.
 *
 * Quad-register variant of vqmla_qs16; low and high halves processed separately.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
798
/** Widening fixed-point multiply-accumulate: a + (b * c), QS8 inputs, QS16 accumulator.
 *
 * The rounded, shifted product stays at 16 bit and is added to the 16 bit
 * accumulator @p a with wrapping arithmetic (see vqmlal_qs8 for saturation).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a + (b * c), QS16 inputs, QS32 accumulator.
 *
 * Same scheme as vmlal_qs8 at 16/32 bit width (see vqmlal_qs16 for saturation).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
832
/** Saturating widening multiply-accumulate: a + (b * c), QS8 inputs, QS16 accumulator.
 *
 * Like vmlal_qs8 but the shift and the final accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a + (b * c), QS16 inputs, QS32 accumulator.
 *
 * Like vmlal_qs16 but the shift and the final accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // vqshlq with a negative shift count performs the right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
866
/** Convert two float32x4 vectors to 8 QS8 values with saturation.
 *
 * Scales each lane by 2^fixed_point_position and rounds half away from zero
 * (adds +0.5 for non-negative lanes, -0.5 for negative lanes) before the
 * truncating float-to-integer conversion; narrowing steps saturate.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncating float -> int conversion
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow to 16 bit, then to 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
894
/** Convert one float32x4 vector to 4 QS16 values with saturation.
 *
 * Scales by 2^fixed_point_position and rounds half away from zero
 * (+0.5 / -0.5 offset per sign) before the truncating conversion;
 * the final narrowing saturates.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = offset + a * 2^fixed_point_position
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncating float -> int conversion
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Saturating narrow to 16 bit
    return vqmovn_s32(res_s32);
}
907
/** Convert four float32x4 vectors to 16 QS8 values with saturation.
 *
 * Quad-register variant of vqcvt_qs8_f32: scales each lane by
 * 2^fixed_point_position, rounds half away from zero (+0.5 / -0.5 offset
 * per sign), truncates to integer and narrows with saturation.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 where a >= 0, -0.5 where a < 0
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = offset + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncating float -> int conversion
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow to 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow to 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
947
// Convert 8 float values (two float32x4_t) to 16-bit fixed point with saturation.
// Rounds half away from zero via a sign-selected +/-0.5 bias before truncation.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative ones
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate towards zero to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 bits with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
973
// Convert 8 values of 8-bit fixed point to float (two float32x4_t).
// Widens 8 -> 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits (low and high halves)
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1001
// Convert 4 values of 16-bit fixed point to float.
// Widens 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Sign-extend to 32 bits and convert to float
    const float32x4_t widened = vcvtq_f32_s32(vmovl_s16(a));

    // Undo the fixed-point scaling
    return vmulq_f32(widened, vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
1009
// Convert 16 values of 8-bit fixed point to float (four float32x4_t).
// Widens 8 -> 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits (low and high halves)
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1049
// Convert 8 values of 16-bit fixed point to float (two float32x4_t).
// Widens 16 -> 32 bits, converts to float and scales by 2^-fixed_point_position.
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits (low and high halves)
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Undo the fixed-point scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1075
// Reciprocal (1/a) of 8 values of 8-bit fixed point, using Newton-Raphson iteration.
// The input is normalized to [0.5, 1), the reciprocal of the normalized value is
// iterated, and the result is shifted back by the same amount (1/(a*2^s) * 2^s = 1/a).
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshl_s8(x, shift_value);
}
1101
// Reciprocal (1/a) of 4 values of 16-bit fixed point, using Newton-Raphson iteration.
// The input is normalized to [0.5, 1), the reciprocal of the normalized value is
// iterated, and the result is shifted back by the same amount (1/(a*2^s) * 2^s = 1/a).
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1).
    // 16 is the element bit width (was 8, an s8 leftover; the 128-bit variant vrecipq_qs16 uses 16).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshl_s16(x, shift_value);
}
1128
// Saturating reciprocal (1/a) of 8 values of 8-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshl_s8(x, shift_value);
}
1154
// Saturating reciprocal (1/a) of 4 values of 16-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1).
    // 16 is the element bit width (was 8, an s8 leftover; the 128-bit variant vqrecipq_qs16 uses 16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshl_s16(x, shift_value);
}
1181
// Reciprocal (1/a) of 16 values of 8-bit fixed point, using Newton-Raphson iteration.
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshlq_s8(x, shift_value);
}
1208
// Reciprocal (1/a) of 8 values of 16-bit fixed point, using Newton-Raphson iteration.
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (matches vrecip_qs16; was 0x5A56)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift to obtain 1/a
    return vshlq_s16(x, shift_value);
}
1236
// Saturating reciprocal (1/a) of 16 values of 8-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic throughout to avoid wrap-around on overflow.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 8 normalizes a to [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Re-apply the normalization shift (saturating) to obtain 1/a
    return vqshlq_s8(x, shift_value);
}
1263
// Saturating reciprocal (1/a) of 8 values of 16-bit fixed point, using Newton-Raphson
// iteration with saturating arithmetic. A zero input lane produces the maximum
// representable value instead of a meaningless result.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (matches vrecip_qs16; was 0x5A56)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);

    // Find shift value: shift = clz(a) + fixed_point_position - 16 normalizes a to [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use four iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);

    // Saturate result in case of overflow (zero input -> max representable value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1292
// Division (a / b) of 8 values of 8-bit fixed point, computed as a * (1 / b).
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t recip_b = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, recip_b, fixed_point_position);
}
1297
// Division (a / b) of 4 values of 16-bit fixed point, computed as a * (1 / b).
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t recip_b = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, recip_b, fixed_point_position);
}
1302
// Division (a / b) of 16 values of 8-bit fixed point, computed as a * (1 / b).
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t recip_b = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, recip_b, fixed_point_position);
}
1307
// Division (a / b) of 8 values of 16-bit fixed point, computed as a * (1 / b).
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t recip_b = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, recip_b, fixed_point_position);
}
1312
// Degree-4 Taylor polynomial evaluation (Horner scheme) on 8 values of 8-bit fixed
// point: res = a*(A + a*(B + a*(C + a*D))). Coefficients come from exp_tab_qs8 or
// log_tab_qs8 (islog) and are rescaled from Q0.7 to the current fixed-point format;
// the first log coefficient needs one extra left shift (hence const_one added to shift).
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1328
// Degree-4 Taylor polynomial evaluation (Horner scheme) on 4 values of 16-bit fixed
// point: res = a*(A + a*(B + a*(C + a*D))). Coefficients come from exp_tab_qs16 or
// log_tab_qs16 (islog) and are rescaled from Q0.15 to the current fixed-point format;
// the first log coefficient needs one extra left shift (hence const_one added to shift).
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1344
// Saturating variant of vtaylor_poly_qs8: same Horner evaluation, but all adds,
// shifts and multiplies saturate instead of wrapping on overflow.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1360
// Saturating variant of vtaylor_poly_qs16: same Horner evaluation, but all adds,
// shifts and multiplies saturate instead of wrapping on overflow.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1376
// 128-bit (16-lane) variant of vtaylor_poly_qs8: degree-4 Horner evaluation with
// exp/log table coefficients rescaled to the current fixed-point format.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1392
// 128-bit (8-lane) variant of vtaylor_poly_qs16: degree-4 Horner evaluation with
// exp/log table coefficients rescaled to the current fixed-point format.
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1408
// Saturating 128-bit (16-lane) variant of vtaylor_poly_qs8: Horner evaluation with
// saturating adds, shifts and multiplies.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1424
// Saturating 128-bit (8-lane) variant of vtaylor_poly_qs16: Horner evaluation with
// saturating adds, shifts and multiplies.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1440
// Saturating exponential (e^a) of 8 values of 8-bit fixed point.
// Writes a = m*ln(2) + alpha, evaluates e^alpha with a Taylor polynomial on the
// reduced range, then reconstructs by shifting with the integer part m.
// Constants 0x58/0x38 are ln(2) and the fractional part of 1/ln(2) in Q0.7;
// assumes the fixed-point format supports the involved shifts -- TODO confirm range.
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1466
// Saturating exponential (e^a) of 4 values of 16-bit fixed point.
// Same range-reduction scheme as vqexp_qs8; constants 0x58B9/0x38AA are ln(2)
// and the fractional part of 1/ln(2) in Q0.15.
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1492
/** Saturating exponential of a QS8 fixed-point vector (16 elements).
 *
 * Quad-register variant of vqexp_qs8: reduce the input to an integer multiple
 * m of ln(2) plus a residual in [-ln(2), ln(2)], approximate exp of the
 * residual with a Taylor polynomial, then rebuild with a saturating shift.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of exp(a) in the same fixed-point format.
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t norm_shift = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t ln2        = vqrshlq_s8(vdupq_n_s8(0x58), norm_shift);                // ln(2) rescaled to the requested Q format
    const qint8x16_t inv_ln2    = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), norm_shift), one); // 1/ln(2): fractional part rescaled, integer bit OR-ed in

    // Range reduction: express a as m * ln(2) + residual, residual in [-ln(2), ln(2)]
    const qint8x16_t m = vqmulq_qs8(a, inv_ln2, fixed_point_position); // a / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint8x16_t m_int = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // Residual = |a - m_int * ln(2)|
    qint8x16_t residual = vqmulq_qs8(vqshlq_s8(m_int, vdupq_n_s8(fixed_point_position)), ln2, fixed_point_position);
    residual            = vqabsq_qs8(vqsubq_qs8(a, residual));

    // Taylor polynomial approximation of exp(residual); add the constant term (1) separately
    qint8x16_t result = vqtaylor_polyq_qs8<false>(residual, fixed_point_position);
    result            = vqaddq_s8(result, one);

    // Reconstruct: multiply by 2^m_int via a saturating shift
    result = vqshlq_s8(result, m_int);

    return result;
}
1518
/** Saturating exponential of a QS16 fixed-point vector (8 elements).
 *
 * Quad-register variant of vqexp_qs16: reduce the input to an integer
 * multiple m of ln(2) plus a residual in [-ln(2), ln(2)], approximate exp of
 * the residual with a Taylor polynomial, then rebuild with a saturating shift.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of exp(a) in the same fixed-point format.
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t norm_shift = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t one        = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t ln2        = vqrshlq_s16(vdupq_n_s16(0x58B9), norm_shift);                // ln(2) rescaled to the requested Q format
    const qint16x8_t inv_ln2    = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), norm_shift), one); // 1/ln(2): fractional part rescaled, integer bit OR-ed in

    // Range reduction: express a as m * ln(2) + residual, residual in [-ln(2), ln(2)]
    const qint16x8_t m = vqmulq_qs16(a, inv_ln2, fixed_point_position); // a / ln(2)

    // Integer part of m (arithmetic shift right by the fractional bit count)
    const qint16x8_t m_int = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // Residual = |a - m_int * ln(2)|
    qint16x8_t residual = vqmulq_qs16(vqshlq_s16(m_int, vdupq_n_s16(fixed_point_position)), ln2, fixed_point_position);
    residual            = vqabsq_qs16(vqsubq_qs16(a, residual));

    // Taylor polynomial approximation of exp(residual); add the constant term (1) separately
    qint16x8_t result = vqtaylor_polyq_qs16<false>(residual, fixed_point_position);
    result            = vqaddq_s16(result, one);

    // Reconstruct: multiply by 2^m_int via a saturating shift
    result = vqshlq_s16(result, m_int);

    return result;
}
1544
/** Natural logarithm of a QS8 fixed-point vector (8 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized to (1, 2], a Taylor polynomial approximates log of the mantissa,
 * and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Fix: select the input value (not the zero vector) in the lanes that need
    // a reciprocal, matching vlogq_qs8. The operands were swapped, so
    // vrecip_qs8 was fed 0 for every lane with a < 1.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1583
/** Natural logarithm of a QS16 fixed-point vector (4 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized to (1, 2], a Taylor polynomial approximates log of the mantissa,
 * and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Fix: select the input value (not the zero vector) in the lanes that need
    // a reciprocal, matching vlogq_qs16. The operands were swapped, so
    // vrecip_qs16 was fed 0 for every lane with a < 1.
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1622
/** Natural logarithm of a QS8 fixed-point vector (16 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized into (1, 2], a Taylor polynomial approximates log of the
 * mantissa, and the exponent contribution (n * ln(2)) is added back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x); the result is negated at the end
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep the input only in lanes that need a reciprocal (others stay 0)
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1661
/** Natural logarithm of a QS16 fixed-point vector (8 elements).
 *
 * For inputs in (0, 1) the function computes -log(1/a); otherwise the input is
 * normalized into (1, 2], a Taylor polynomial approximates log of the
 * mantissa, and the exponent contribution (n * ln(2)) is added back.
 * NOTE(review): this variant mixes saturating (vqsubq/vqaddq/vqmulq) and
 * wrapping ops, unlike vlogq_qs8 — presumably intentional for QS16 overflow
 * safety; confirm against the reference implementation.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of log(a) in the same fixed-point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x); the result is negated at the end
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep the input only in lanes that need a reciprocal (others stay 0)
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal; lanes with a >= 1 hold garbage here and are discarded below
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // n * 1.0 in fixed point

    // Polynomial Approximation of log(1 + temp)
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1700
/** Inverse square root (1/sqrt(a)) of a QS8 fixed-point vector (8 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; three Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t three = vdup_n_s8(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint8x8_t norm_shift = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint8x8_t raw_shift   = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t is_negative = vclt_s8(raw_shift, vdup_n_qs8(0));
    raw_shift             = vbsl_s8(is_negative, vadd_s8(raw_shift, vdup_n_s8(1)), raw_shift);
    qint8x8_t denorm_shift = vneg_s8(vshr_n_s8(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint8x8_t norm_a   = vshl_s8(a, norm_shift);
    qint8x8_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Three iterations are enough for 8-bit precision.
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s8(vmul_qs8(estimate, vsub_s8(three, vmul_qs8(norm_a, vmul_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vshl_s8(estimate, denorm_shift);
}
1727
/** Inverse square root (1/sqrt(a)) of a QS16 fixed-point vector (4 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1756
/** Saturating inverse square root (1/sqrt(a)) of a QS8 fixed-point vector
 *  (8 elements).
 *
 * Same algorithm as vinvsqrt_qs8 but using saturating intrinsics for the
 * shift/offset arithmetic and the Newton-Raphson updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1783
/** Saturating inverse square root (1/sqrt(a)) of a QS16 fixed-point vector
 *  (4 elements).
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back. Saturating
 * intrinsics guard the shift/offset arithmetic and the updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t three = vdup_n_s16(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint16x4_t norm_shift = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint16x4_t raw_shift   = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t is_negative = vclt_s16(raw_shift, vdup_n_qs16(0));
    raw_shift              = vbsl_s16(is_negative, vqadd_s16(raw_shift, vdup_n_s16(1)), raw_shift);
    qint16x4_t denorm_shift = vqneg_s16(vshr_n_s16(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint16x4_t norm_a   = vqshl_s16(a, norm_shift);
    qint16x4_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Five iterations are enough for 16-bit precision.
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshr_n_s16(vqmul_qs16(estimate, vqsub_s16(three, vqmul_qs16(norm_a, vqmul_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vqshl_s16(estimate, denorm_shift);
}
1812
/** Inverse square root (1/sqrt(a)) of a QS8 fixed-point vector (16 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; three Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t three = vdupq_n_s8(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint8x16_t norm_shift = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint8x16_t raw_shift   = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t is_negative = vcltq_s8(raw_shift, vdupq_n_qs8(0));
    raw_shift              = vbslq_s8(is_negative, vaddq_s8(raw_shift, vdupq_n_s8(1)), raw_shift);
    qint8x16_t denorm_shift = vnegq_s8(vshrq_n_s8(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint8x16_t norm_a   = vshlq_s8(a, norm_shift);
    qint8x16_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Three iterations are enough for 8-bit precision.
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s8(vmulq_qs8(estimate, vsubq_s8(three, vmulq_qs8(norm_a, vmulq_qs8(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vshlq_s8(estimate, denorm_shift);
}
1839
/** Inverse square root (1/sqrt(a)) of a QS16 fixed-point vector (8 elements),
 *  non-saturating arithmetic.
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1868
/** Saturating inverse square root (1/sqrt(a)) of a QS8 fixed-point vector
 *  (16 elements).
 *
 * Same algorithm as vinvsqrtq_qs8 but using saturating intrinsics for the
 * shift/offset arithmetic and the Newton-Raphson updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the opposite shift, applied at the end to de-normalise the result
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1895
/** Saturating inverse square root (1/sqrt(a)) of a QS16 fixed-point vector
 *  (8 elements).
 *
 * The input is first scaled into the (0.5, 2) range; five Newton-Raphson
 * iterations refine the estimate, which is then scaled back. Saturating
 * intrinsics guard the shift/offset arithmetic and the updates.
 *
 * @param[in] a                    Input vector in fixed-point format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of 1/sqrt(a) in the same fixed-point format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t three = vdupq_n_s16(3 << fixed_point_position);

    // Normalisation shift that brings the input into the (0.5, 2) range
    qint16x8_t norm_shift = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Halved opposite shift used to de-normalise the result at the end;
    // negative values are biased by one so the right shift by 1 behaves correctly
    qint16x8_t raw_shift   = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t is_negative = vcltq_s16(raw_shift, vdupq_n_qs16(0));
    raw_shift              = vbslq_s16(is_negative, vqaddq_s16(raw_shift, vdupq_n_s16(1)), raw_shift);
    qint16x8_t denorm_shift = vqnegq_s16(vshrq_n_s16(raw_shift, 1));

    // Normalised input doubles as the initial guess
    const qint16x8_t norm_a   = vqshlq_s16(a, norm_shift);
    qint16x8_t       estimate = norm_a;

    // Newton-Raphson refinement: x <- (x / 2) * (3 - a * x^2).
    // Five iterations are enough for 16-bit precision.
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    estimate = vshrq_n_s16(vqmulq_qs16(estimate, vqsubq_s16(three, vqmulq_qs16(norm_a, vqmulq_qs16(estimate, estimate, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalisation
    return vqshlq_s16(estimate, denorm_shift);
}
1924
/** Saturating hyperbolic tangent of a QS8 fixed-point vector (8 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t two = vdup_n_s8(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint8x8_t e2a         = vqexp_qs8(vqmul_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x8_t numerator   = vqsub_qs8(e2a, one);
    const qint8x8_t denominator = vqadd_qs8(e2a, one);

    return vqmul_qs8(numerator, vqrecip_qs8(denominator, fixed_point_position), fixed_point_position);
}
1937
/** Saturating hyperbolic tangent of a QS16 fixed-point vector (4 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t two = vdup_n_s16(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint16x4_t e2a         = vqexp_qs16(vqmul_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x4_t numerator   = vqsub_qs16(e2a, one);
    const qint16x4_t denominator = vqadd_qs16(e2a, one);

    return vqmul_qs16(numerator, vqrecip_qs16(denominator, fixed_point_position), fixed_point_position);
}
1950
/** Saturating hyperbolic tangent of a QS8 fixed-point vector (16 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint8x16_t e2a         = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t numerator   = vqsubq_qs8(e2a, one);
    const qint8x16_t denominator = vqaddq_qs8(e2a, one);

    return vqmulq_qs8(numerator, vqrecipq_qs8(denominator, fixed_point_position), fixed_point_position);
}
1963
/** Saturating hyperbolic tangent of a QS16 fixed-point vector (8 elements).
 *
 * Uses the identity tanh(a) = (exp(2a) - 1) / (exp(2a) + 1); the division is
 * realised as a multiplication by the saturating reciprocal.
 *
 * @param[in] a                    Input vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of tanh(a) in the same fixed-point format.
 */
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t two = vdupq_n_s16(2 << fixed_point_position);

    // tanh(a) = (exp(2a) - 1) / (exp(2a) + 1)
    const qint16x8_t e2a         = vqexpq_qs16(vqmulq_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t numerator   = vqsubq_qs16(e2a, one);
    const qint16x8_t denominator = vqaddq_qs16(e2a, one);

    return vqmulq_qs16(numerator, vqrecipq_qs16(denominator, fixed_point_position), fixed_point_position);
}
1976
/** Saturating power function a^b for QS8 fixed-point vectors (16 elements).
 *
 * Evaluated via the identity a^b = exp(b * log(a)).
 *
 * @param[in] a                    Base vector in fixed-point format (expected > 0).
 * @param[in] b                    Exponent vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of a^b in the same fixed-point format.
 */
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a^b = exp(b * log(a))
    const qint8x16_t log_a = vlogq_qs8(a, fixed_point_position);
    return vqexpq_qs8(vqmulq_qs8(b, log_a, fixed_point_position), fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001981
/** Saturating power function a^b for QS16 fixed-point vectors (8 elements).
 *
 * Evaluated via the identity a^b = exp(b * log(a)).
 *
 * @param[in] a                    Base vector in fixed-point format (expected > 0).
 * @param[in] b                    Exponent vector in fixed-point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The result of a^b in the same fixed-point format.
 */
inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a^b = exp(b * log(a))
    const qint16x8_t log_a = vlogq_qs16(a, fixed_point_position);
    return vqexpq_qs16(vqmulq_qs16(b, log_a, fixed_point_position), fixed_point_position);
}
1986
/** Element-wise maximum of two pairs of float32x4 vectors.
 *
 * @param[in] a First pair of vectors.
 * @param[in] b Second pair of vectors.
 *
 * @return A pair holding the lane-wise maximum of the corresponding halves.
 */
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001998}