blob: a5d9e7685d591801553989e4edae98633257a716 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010028/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
29 * Format is in Q0.7 for all elements
30 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010031static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010032{
33 {
34 vdup_n_s8(0x7F), // 0.9978546
35 vdup_n_s8(0x3F), // 0.4994721
36 vdup_n_s8(0x16), // 0.1763723
37 vdup_n_s8(0x05), // 0.0435108
38 }
39};
40
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010041/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
42 * Format is in Q0.15 for all elements
43 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010044static const std::array<qint16x4_t, 4> exp_tab_qs16 =
45{
46 {
47 vdup_n_s16(0x7FBA), // 0.9978546
48 vdup_n_s16(0x3FE9), // 0.4994721
49 vdup_n_s16(0x1693), // 0.1763723
50 vdup_n_s16(0x0592), // 0.0435108
51 }
52};
53
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010054/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
55 * Format is in Q0.7 for all elements
56 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010057static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010058{
59 {
60 vdupq_n_s8(0x7F), // 0.9978546
61 vdupq_n_s8(0x3F), // 0.4994721
62 vdupq_n_s8(0x16), // 0.1763723
63 vdupq_n_s8(0x05), // 0.0435108
64 }
65};
66
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010067/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
68 * Format is in Q0.15 for all elements
69 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010070static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
71{
72 {
73 vdupq_n_s16(0x7FBA), // 0.9978546
74 vdupq_n_s16(0x3FE9), // 0.4994721
75 vdupq_n_s16(0x1693), // 0.1763723
76 vdupq_n_s16(0x0592), // 0.0435108
77 }
78};
79
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010080/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
81 * Format is in Q0.7 for all elements except the first one which is in Q1.6
82 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010083static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010084{
85 {
86 vdup_n_s8(0x5C), // 1.4384189
87 vdup_n_s8(-0x56), // -0.6771900
88 vdup_n_s8(0x29), // 0.3218538
89 vdup_n_s8(-0x0A), // -0.0832229
90 }
91};
92
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010093/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
94 * Format is in Q0.15 for all elements except the first one which is in Q1.14
95 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010096static const std::array<qint16x4_t, 4> log_tab_qs16 =
97{
98 {
99 vdup_n_s16(0x5C0F), // 1.4384189
100 vdup_n_s16(-0x56AE), // -0.6771900
101 vdup_n_s16(0x2933), // 0.3218538
102 vdup_n_s16(-0x0AA7), // -0.0832229
103 }
104};
105
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100106/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
107 * Format is in Q0.7 for all elements except the first one which is in Q1.6
108 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100109static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100110{
111 {
112 vdupq_n_s8(0x5C), // 1.4384189
113 vdupq_n_s8(-0x56), // -0.6771900
114 vdupq_n_s8(0x29), // 0.3218538
115 vdupq_n_s8(-0x0A), // -0.0832229
116 }
117};
118
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100119/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
120 * Format is in Q0.15 for all elements except the first one which is in Q1.14
121 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100122static const std::array<qint16x8_t, 4> log_tabq_qs16 =
123{
124 {
125 vdupq_n_s16(0x5C0F), // 1.4384189
126 vdupq_n_s16(-0x56AE), // -0.6771900
127 vdupq_n_s16(0x2933), // 0.3218538
128 vdupq_n_s16(-0x0AA7), // -0.0832229
129 }
130};
131
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100132inline qint8x8_t vget_low_qs8(qint8x16_t a)
133{
134 return vget_low_s8(a);
135}
136
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100137inline qint16x4_t vget_low_qs16(qint16x8_t a)
138{
139 return vget_low_s16(a);
140}
141
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100142inline qint8x8_t vget_high_qs8(qint8x16_t a)
143{
144 return vget_high_s8(a);
145}
146
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100147inline qint16x4_t vget_high_qs16(qint16x8_t a)
148{
149 return vget_high_s16(a);
150}
151
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100152inline qint8x8_t vld1_qs8(const qint8_t *addr)
153{
154 return vld1_s8(addr);
155}
156
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100157inline qint16x4_t vld1_qs16(const qint16_t *addr)
158{
159 return vld1_s16(addr);
160}
161
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100162inline qint8x16_t vld1q_qs8(const qint8_t *addr)
163{
164 return vld1q_s8(addr);
165}
166
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100167inline qint16x8_t vld1q_qs16(const qint16_t *addr)
168{
169 return vld1q_s16(addr);
170}
171
172inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
173{
174 return vld1_dup_s8(addr);
175}
176
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100177inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
178{
179 return vld1_dup_s16(addr);
180}
181
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100182inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
183{
184 return vld1q_dup_s8(addr);
185}
186
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100187inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
188{
189 return vld1q_dup_s16(addr);
190}
191
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100192inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
193{
194 return vld2q_s16(addr);
195}
196
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100197inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
198{
199 vst1_s8(addr, b);
200}
201
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100202inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
203{
204 vst1_s16(addr, b);
205}
206
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100207inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
208{
209 vst1q_s8(addr, b);
210}
211
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100212inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
213{
214 vst1q_s16(addr, b);
215}
216
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100217inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
218{
219 vst2q_s16(addr, b);
220}
221
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100222inline qint8x8_t vqmovn_qs16(qint16x8_t a)
223{
224 return vqmovn_s16(a);
225}
226
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100227inline qint16x4_t vqmovn_qs32(qint32x4_t a)
228{
229 return vqmovn_s32(a);
230}
231
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100232inline qint8x8_t vdup_n_qs8(qint8_t a)
233{
234 return vdup_n_s8(a);
235}
236
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100237inline qint16x4_t vdup_n_qs16(qint16_t a)
238{
239 return vdup_n_s16(a);
240}
241
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100242inline qint8x16_t vdupq_n_qs8(qint8_t a)
243{
244 return vdupq_n_s8(a);
245}
246
247inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
248{
249 float32x4x4_t res =
250 {
251 {
252 vdupq_n_f32(a),
253 vdupq_n_f32(a),
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 }
257 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100258 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100259}
260
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100261inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
262{
263 float32x4x2_t res =
264 {
265 {
266 vdupq_n_f32(a),
267 vdupq_n_f32(a),
268 }
269 };
270 return vqcvtq_qs16_f32(res, fixed_point_position);
271}
272
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100273inline qint16x8_t vdupq_n_qs16(qint16_t a)
274{
275 return vdupq_n_s16(a);
276}
277
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100278inline qint32x4_t vdupq_n_qs32(qint32_t a)
279{
280 return vdupq_n_s32(a);
281}
282
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283inline qint8x8_t vabs_qs8(qint8x8_t a)
284{
285 return vabs_s8(a);
286}
287
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100288inline qint16x4_t vabs_qs16(qint16x4_t a)
289{
290 return vabs_s16(a);
291}
292
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100293inline qint8x16_t vabsq_qs8(qint8x16_t a)
294{
295 return vabsq_s8(a);
296}
297
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100298inline qint16x8_t vabsq_qs16(qint16x8_t a)
299{
300 return vabsq_s16(a);
301}
302
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100303inline qint8x8_t vqabs_qs8(qint8x8_t a)
304{
305 return vqabs_s8(a);
306}
307
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100308inline qint16x4_t vqabs_qs16(qint16x4_t a)
309{
310 return vqabs_s16(a);
311}
312
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100313inline qint8x16_t vqabsq_qs8(qint8x16_t a)
314{
315 return vqabsq_s8(a);
316}
317
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100318inline qint16x8_t vqabsq_qs16(qint16x8_t a)
319{
320 return vqabsq_s16(a);
321}
322
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100323inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
324{
325 return vmax_s8(a, b);
326}
327
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100328inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
329{
330 return vmax_s16(a, b);
331}
332
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100333inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
334{
335 return vmaxq_s8(a, b);
336}
337
338inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
339{
340 return vpmax_s8(a, b);
341}
342
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100343inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
344{
345 return vpmax_s16(a, b);
346}
347
348inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
349{
350 return vmaxq_s16(a, b);
351}
352
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100353inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
354{
355 return vmin_s8(a, b);
356}
357
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100358inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
359{
360 return vmin_s16(a, b);
361}
362
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100363inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
364{
365 return vminq_s8(a, b);
366}
367
368inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
369{
370 return vpmin_s8(a, b);
371}
372
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100373inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
374{
375 return vpmin_s16(a, b);
376}
377
378inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
379{
380 return vminq_s16(a, b);
381}
382
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100383inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
384{
385 return vadd_s8(a, b);
386}
387
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100388inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
389{
390 return vadd_s16(a, b);
391}
392
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100393inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
394{
395 return vaddq_s8(a, b);
396}
397
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100398inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
399{
400 return vaddq_s16(a, b);
401}
402
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
404{
405 return vqadd_s8(a, b);
406}
407
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100408inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
409{
410 return vqadd_s16(a, b);
411}
412
Georgios Pinitas9247c922017-06-28 18:29:47 +0100413inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
414{
415 return vqadd_s32(a, b);
416}
417
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100418inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
419{
420 return vqaddq_s8(a, b);
421}
422
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100423inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
424{
425 return vqaddq_s16(a, b);
426}
427
Georgios Pinitas9247c922017-06-28 18:29:47 +0100428inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
429{
430 return vqaddq_s32(a, b);
431}
432
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100433inline int16x4_t vpaddl_qs8(qint8x8_t a)
434{
435 return vpaddl_s8(a);
436}
437
438inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
439{
440 return vsub_s8(a, b);
441}
442
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100443inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
444{
445 return vsub_s16(a, b);
446}
447
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100448inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
449{
450 return vsubq_s8(a, b);
451}
452
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100453inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
454{
455 return vsubq_s16(a, b);
456}
457
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100458inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
459{
460 return vqsub_s8(a, b);
461}
462
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100463inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
464{
465 return vqsub_s16(a, b);
466}
467
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100468inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
469{
470 return vqsubq_s8(a, b);
471}
472
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100473inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
474{
475 return vqsubq_s16(a, b);
476}
477
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
479{
480 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
481
482 // Initialize the temporary result with a constant used to round up the result
483 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
484
485 // Vector multiply-accumulate long
486 res = vmlal_s8(res, a, b);
487
488 // Shift right by fixed_point_position
489 res = vshlq_s16(res, fixed_point_position_s16);
490
491 // Convert back to qint8
492 return vmovn_s16(res);
493}
494
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100495inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
496{
497 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
498
499 // Initialize the temporary result with a constant used to round up the result
500 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
501
502 // Vector multiply-accumulate long
503 res = vmlal_s16(res, a, b);
504
505 // Shift right by fixed_point_position
506 res = vshlq_s32(res, fixed_point_position_s32);
507
508 // Convert back to qint16
509 return vmovn_s32(res);
510}
511
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100512inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
513{
514 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
515
516 // Initialize the temporary results with a constant used to round up the result
517 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
518 qint16x8_t res1 = res0;
519
520 // Vector multiply-accumulate long
521 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
522 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
523
524 // Shift right by fixed_point_position
525 res0 = vshlq_s16(res0, fixed_point_position_s16);
526 res1 = vshlq_s16(res1, fixed_point_position_s16);
527
528 // Convert back to qint8
529 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
530}
531
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100532inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
533{
534 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
535
536 // Initialize the temporary results with a constant used to round up the result
537 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
538 qint32x4_t res1 = res0;
539
540 // Vector multiply-accumulate long
541 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
542 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
543
544 // Shift right by fixed_point_position
545 res0 = vshlq_s32(res0, fixed_point_position_s32);
546 res1 = vshlq_s32(res1, fixed_point_position_s32);
547
548 // Convert back to qint16
549 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
550}
551
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100552inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
553{
554 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
555
556 // Initialize the temporary result with a constant used to round up the result
557 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
558
559 // Vector multiply-accumulate long
560 res = vmlal_s8(res, a, b);
561
562 // Shift right by fixed_point_position
563 res = vqshlq_s16(res, fixed_point_position_s16);
564
565 // Convert back to qint8 and saturate
566 return vqmovn_s16(res);
567}
568
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100569inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
570{
571 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
572
573 // Initialize the temporary result with a constant used to round up the result
574 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
575
576 // Vector multiply-accumulate long
577 res = vmlal_s16(res, a, b);
578
579 // Shift right by fixed_point_position
580 res = vqshlq_s32(res, fixed_point_position_s32);
581
582 // Convert back to qint16 and saturate
583 return vqmovn_s32(res);
584}
585
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100586inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
587{
588 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
589
590 // Initialize the temporary results with a constant used to round up the result
591 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
592 qint16x8_t res1 = res0;
593
594 // Vector multiply-accumulate long
595 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
596 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
597
598 // Shift right by fixed_point_position
599 res0 = vqshlq_s16(res0, fixed_point_position_s16);
600 res1 = vqshlq_s16(res1, fixed_point_position_s16);
601
602 // Convert back to qint8 and saturate
603 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
604}
605
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100606inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
607{
608 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
609
610 // Initialize the temporary results with a constant used to round up the result
611 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
612 qint32x4_t res1 = res0;
613
614 // Vector multiply-accumulate long
615 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
616 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
617
618 // Shift right by fixed_point_position
619 res0 = vqshlq_s32(res0, fixed_point_position_s32);
620 res1 = vqshlq_s32(res1, fixed_point_position_s32);
621
622 // Convert back to qint16 and saturate
623 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
624}
625
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100626inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
627{
628 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
629
630 qint16x8_t res = vmull_s8(a, b);
631
632 return vqrshlq_s16(res, fixed_point_position_s16);
633}
634
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100635inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
636{
637 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
638
639 // Initialize the temporary results with a constant used to round up the result
640 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
641
642 // Vector multiply-accumulate long
643 tmp = vmull_s16(a, b);
644
645 // Shift right by fixed_point_position
646 return vqshlq_s32(tmp, fixed_point_position_s32);
647}
648
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100649inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
650{
651 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
652
653 // Initialize the temporary results with a constant used to round up the result
654 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
655
656 // Vector multiply-accumulate long
657 tmp = vmlal_s8(tmp, b, c);
658
659 // Shift right by fixed_point_position
660 tmp = vshlq_s16(tmp, fixed_point_position_s16);
661
662 // Convert back to qint8 and accumulate
663 return vadd_s8(a, vmovn_s16(tmp));
664}
665
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100666inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
667{
668 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
669
670 // Initialize the temporary results with a constant used to round up the result
671 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
672
673 // Vector multiply-accumulate long
674 tmp = vmlal_s16(tmp, b, c);
675
676 // Shift right by fixed_point_position
677 tmp = vshlq_s32(tmp, fixed_point_position_s32);
678
679 // Convert back to qint16 and accumulate
680 return vadd_s16(a, vmovn_s32(tmp));
681}
682
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100683inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
684{
685 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
686
687 // Initialize the temporary results with a constant used to round up the result
688 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
689 qint16x8_t tmp1 = tmp0;
690
691 // Vector multiply-accumulate long
692 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
693 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
694
695 // Shift right by fixed_point_position
696 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
697 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
698
699 // Convert back to qint8 and accumulate
700 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
701}
702
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100703inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
704{
705 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
706
707 // Initialize the temporary results with a constant used to round up the result
708 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
709 qint32x4_t tmp1 = tmp0;
710
711 // Vector multiply-accumulate long
712 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
713 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
714
715 // Shift right by fixed_point_position
716 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
717 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
718
719 // Convert back to qint16 and accumulate
720 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
721}
722
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100723inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
724{
725 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
726
727 // Initialize the temporary results with a constant used to round up the result
728 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
729
730 // Vector multiply-accumulate long
731 tmp = vmlal_s8(tmp, b, c);
732
733 // Shift right by fixed_point_position
734 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
735
736 // Convert back to qint8 and accumulate
737 return vqadd_s8(a, vqmovn_s16(tmp));
738}
739
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100740inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
741{
742 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
743
744 // Initialize the temporary results with a constant used to round up the result
745 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
746
747 // Vector multiply-accumulate long
748 tmp = vmlal_s16(tmp, b, c);
749
750 // Shift right by fixed_point_position
751 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
752
753 // Convert back to qint8 and accumulate
754 return vqadd_s16(a, vqmovn_s32(tmp));
755}
756
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100757inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
758{
759 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
760
761 // Initialize the temporary results with a constant used to round up the result
762 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
763 qint16x8_t tmp1 = tmp0;
764
765 // Vector multiply-accumulate long
766 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
767 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
768
769 // Shift right by fixed_point_position
770 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
771 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
772
773 // Convert back to qint8 and accumulate
774 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
775 return vqaddq_s8(a, res);
776}
777
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100778inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
779{
780 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
781
782 // Initialize the temporary results with a constant used to round up the result
783 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
784 qint32x4_t tmp1 = tmp0;
785
786 // Vector multiply-accumulate long
787 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
788 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
789
790 // Shift right by fixed_point_position
791 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
792 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
793
794 // Convert back to qint16 and accumulate
795 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
796 return vqaddq_s16(a, res);
797}
798
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100799inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
800{
801 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
802
803 // Initialize the temporary results with a constant used to round up the result
804 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
805
806 // Vector multiply-accumulate long
807 tmp = vmlal_s8(tmp, b, c);
808
809 // Shift right by fixed_point_position
810 tmp = vshlq_s16(tmp, fixed_point_position_s16);
811
812 // Accumulate
813 return vaddq_s16(a, tmp);
814}
815
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100816inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
817{
818 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
819
820 // Initialize the temporary results with a constant used to round up the result
821 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
822
823 // Vector multiply-accumulate long
824 tmp = vmlal_s16(tmp, b, c);
825
826 // Shift right by fixed_point_position
827 tmp = vshlq_s32(tmp, fixed_point_position_s32);
828
829 // Accumulate
830 return vaddq_s32(a, tmp);
831}
832
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100833inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
834{
835 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
836
837 // Initialize the temporary results with a constant used to round up the result
838 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
839
840 // Vector multiply-accumulate long
841 tmp = vmlal_s8(tmp, b, c);
842
843 // Shift right by fixed_point_position
844 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
845
846 // Accumulate
847 return vqaddq_s16(a, tmp);
848}
849
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100850inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
851{
852 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
853
854 // Initialize the temporary results with a constant used to round up the result
855 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
856
857 // Vector multiply-accumulate long
858 tmp = vmlal_s16(tmp, b, c);
859
860 // Shift right by fixed_point_position
861 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
862
863 // Accumulate
864 return vqaddq_s32(a, tmp);
865}
866
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100867inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100868{
869 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
870
871 float32x4x2_t res_f32 =
872 {
873 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100874 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
875 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100876 }
877 };
878
879 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
880 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
881
882 const int32x4x2_t res_s32 =
883 {
884 {
885 vcvtq_s32_f32(res_f32.val[0]),
886 vcvtq_s32_f32(res_f32.val[1]),
887 }
888 };
889
890 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
891
892 return vqmovn_s16(res_s16);
893}
894
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100895inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100896{
897 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
898
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100899 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100900
901 res_f32 = vmlaq_f32(res_f32, a, pow2);
902
903 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
904
905 return vqmovn_s32(res_s32);
906}
907
/** Convert a float32x4x4_t (16 floats) to a 16-lane 8 bit fixed point vector, with saturation.
 *
 * Rounds half away from zero: a +/-0.5 bias is selected per lane from the input's sign
 * before the truncating float-to-int conversion.
 *
 * @param[in] a                    Float vector quadruple to convert
 * @param[in] fixed_point_position Number of fractional bits of the target Q format
 *
 * @return Converted 8 bit fixed point values (saturated on narrowing)
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative lanes
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate towards zero to signed 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow to 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow to 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
947
/** Convert a float32x4x2_t (8 floats) to an 8-lane 16 bit fixed point vector, with saturation.
 *
 * Rounds half away from zero: a +/-0.5 bias is selected per lane from the input's sign
 * before the truncating float-to-int conversion.
 *
 * @param[in] a                    Float vector pair to convert
 * @param[in] fixed_point_position Number of fractional bits of the target Q format
 *
 * @return Converted 16 bit fixed point values (saturated on narrowing)
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative lanes, -0.5 for negative lanes
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate towards zero to signed 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow to 16 bit
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
973
/** Convert an 8 bit fixed point vector to a pair of float32x4_t (8 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position. The conversion is exact (no saturation needed).
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 bit -> 16 bit
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 bit -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1001
/** Convert a 16 bit fixed point vector to a float32x4_t.
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen to 32 bit, convert to float, then apply the fixed point scale
    const int32x4_t widened = vmovl_s16(a);
    return vmulq_f32(vcvtq_f32_s32(widened), scale);
}
1009
/** Convert a 16-lane 8 bit fixed point vector to a float32x4x4_t (16 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    8 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 bit -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 bit -> 32 bit
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1049
/** Convert an 8-lane 16 bit fixed point vector to a float32x4x2_t (8 floats).
 *
 * Each lane is sign-extended to 32 bit, converted to float and multiplied
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16 bit fixed point vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the source Q format
 *
 * @return Converted float values
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 bit -> 32 bit
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1075
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson.
 *
 * The input is normalized into [0.5, 1), an initial estimate x0 = 48/17 - 32/17 * a
 * is computed, refined with three Newton-Raphson iterations x = x * (2 - a * x),
 * and the result is denormalized by the same shift.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshl_s8(x, shift_value);
}
1100
/** Calculate the reciprocal (1/a) of a 16 bit fixed point vector using Newton-Raphson.
 *
 * The input is normalized into [0.5, 1), an initial estimate x0 = 48/17 - 32/17 * a
 * is computed, refined with five Newton-Raphson iterations x = x * (2 - a * x),
 * and the result is denormalized by the same shift.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1).
    // BUGFIX: the element width here is 16 bits (vclz_s16 counts out of 16), not 8 —
    // using 8 produced a wrong normalization shift; this matches vrecipq_qs16.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshl_s16(x, shift_value);
}
1127
/** Calculate the reciprocal (1/a) of an 8 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating variant of vrecip_qs8: all shifts, additions and subtractions use
 * the saturating (vq*) intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshl_s8(x, shift_value);
}
1152
/** Calculate the reciprocal (1/a) of a 16 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating variant of vrecip_qs16: all shifts, additions and subtractions use
 * the saturating (vq*) intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1).
    // BUGFIX: the element width here is 16 bits (vclz_s16 counts out of 16), not 8 —
    // using 8 produced a wrong normalization shift; this matches vqrecipq_qs16.
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it exceeds one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshl_s16(x, shift_value);
}
1179
/** Calculate the reciprocal (1/a) of a 16-lane 8 bit fixed point vector using Newton-Raphson.
 *
 * Quad-register variant of vrecip_qs8: normalize into [0.5, 1), estimate
 * x0 = 48/17 - 32/17 * a, refine with three Newton-Raphson iterations, denormalize.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshlq_s8(x, shift_value);
}
1205
/** Calculate the reciprocal (1/a) of an 8-lane 16 bit fixed point vector using Newton-Raphson.
 *
 * Quad-register variant of vrecip_qs16: normalize into [0.5, 1), estimate
 * x0 = 48/17 - 32/17 * a, refine with five Newton-Raphson iterations, denormalize.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used by the 64-bit variant; round(48/17 * 2^13) = 0x5A5A —
    // confirm which constant is intended (either converges, but they should agree).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (16 = element width in bits)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vshlq_s16(x, shift_value);
}
1233
/** Calculate the reciprocal (1/a) of a 16-lane 8 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating quad-register variant of vrecip_qs8: all shifts, additions and
 * subtractions use the saturating (vq*) intrinsics.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive; subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (8 = element width in bits)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalize the result by the same shift used to normalize the input
    return vqshlq_s8(x, shift_value);
}
1259
/** Calculate the reciprocal (1/a) of an 8-lane 16 bit fixed point vector using Newton-Raphson, with saturation.
 *
 * Saturating quad-register variant of vrecip_qs16. Additionally, lanes where the
 * input is zero are forced to the maximum representable value (saturated 1/0).
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal, in the same Q format (saturated)
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used by the 64-bit variant; round(48/17 * 2^13) = 0x5A5A —
    // confirm which constant is intended (either converges, but they should agree).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value: normalize a into [0.5, 1) (16 = element width in bits)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow: a == 0 lanes return INT16_MAX
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1288
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001289inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1290{
1291 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1292}
1293
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001294inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1295{
1296 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1297}
1298
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001299inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1300{
1301 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1302}
1303
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001304inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1305{
1306 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1307}
1308
/** Evaluate a degree-4 Taylor polynomial (Horner's method) on an 8 bit fixed point vector.
 *
 * Selects the log or exp coefficient table at compile time via @p islog and rescales
 * the Q0.7 table entries to the caller's Q format before evaluation.
 * Result is a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1324
/** Evaluate a degree-4 Taylor polynomial (Horner's method) on a 16 bit fixed point vector.
 *
 * Selects the log or exp coefficient table at compile time via @p islog and rescales
 * the Q0.15 table entries to the caller's Q format before evaluation.
 * Result is a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1340
/** Evaluate a degree-4 Taylor polynomial on an 8 bit fixed point vector, with saturation.
 *
 * Saturating variant of vtaylor_poly_qs8: uses vq* intrinsics so intermediate
 * overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1356
/** Evaluate a degree-4 Taylor polynomial on a 16 bit fixed point vector, with saturation.
 *
 * Saturating variant of vtaylor_poly_qs16: uses vq* intrinsics so intermediate
 * overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1372
/** Evaluate a degree-4 Taylor polynomial on a 16-lane 8 bit fixed point vector.
 *
 * Quad-register variant of vtaylor_poly_qs8: rescale the Q0.7 table entries to
 * the caller's Q format, then Horner-evaluate a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1388
/** Evaluate a degree-4 Taylor polynomial on an 8-lane 16 bit fixed point vector.
 *
 * Quad-register variant of vtaylor_poly_qs16: rescale the Q0.15 table entries to
 * the caller's Q format, then Horner-evaluate a*(A + a*(B + a*(C + a*D))).
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1404
/** Evaluate a degree-4 Taylor polynomial on a 16-lane 8 bit fixed point vector, with saturation.
 *
 * Saturating quad-register variant of vtaylor_poly_qs8: uses vq* intrinsics so
 * intermediate overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.7 to the target Q format (rounding shift right)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1420
/** Evaluate a degree-4 Taylor polynomial on an 8-lane 16 bit fixed point vector, with saturation.
 *
 * Saturating quad-register variant of vtaylor_poly_qs16: uses vq* intrinsics so
 * intermediate overflow clamps instead of wrapping.
 *
 * @tparam    islog                True to use the log table, false for the exp table
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Polynomial value, in the same Q format (saturated)
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients from Q0.15 to the target Q format (rounding shift right)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // Log's leading coefficient needs one extra bit of shift
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation: ((D*a + C)*a + B)*a + A, then one final multiply by a
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1436
/** Calculate the saturating exponential of an 8 bit fixed point vector.
 *
 * Computes exp(a) via range reduction: a = m*ln(2) + alpha with alpha in
 * [-ln(2), ln(2)], a Taylor polynomial approximation of exp(alpha), and
 * reconstruction by shifting with the integer part m.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 constants below to the caller's Q format
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1462
/** Calculate the saturating exponential of a 16 bit fixed point vector.
 *
 * Computes exp(a) via range reduction: a = m*ln(2) + alpha with alpha in
 * [-ln(2), ln(2)], a Taylor polynomial approximation of exp(alpha), and
 * reconstruction by shifting with the integer part m.
 *
 * @param[in] a                    16 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 constants below to the caller's Q format
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1488
/** Calculate the saturating exponential of a 16-lane 8 bit fixed point vector.
 *
 * Quad-register variant of vqexp_qs8: range reduction a = m*ln(2) + alpha,
 * Taylor polynomial approximation of exp(alpha), reconstruction by shifting.
 *
 * @param[in] a                    8 bit fixed point vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return exp(a), in the same Q format (saturated)
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 constants below to the caller's Q format
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // OR with const_one sets the integer bit: 1/ln(2) = 1 + 0.4427...
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1514
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001515inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
1516{
1517 const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
1518 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1519 const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
1520 const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
1521
1522 // Perform range reduction [-log(2),log(2)]
1523 const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
1524
1525 // get decimal part from m
1526 const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
1527
1528 qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
1529 alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));
1530
1531 // Polynomial Approximation
1532 qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
1533 poly = vqaddq_s16(poly, const_one);
1534
1535 // Reconstruct
1536 poly = vqshlq_s16(poly, dec_m);
1537
1538 return poly;
1539}
1540
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001541inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1542{
1543 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1544 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1545 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1546
1547 // If 0 < a < 1, calculate log(1/x)
1548 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1549 qint8x8_t recip = vdup_n_s8(0);
1550 recip = vbsl_s8(calc_reciprocal, recip, a);
1551
1552 // Calculate reciprocal
1553 recip = vrecip_qs8(recip, fixed_point_position);
1554 a = vbsl_s8(calc_reciprocal, recip, a);
1555
1556 // Get decimal part of a
1557 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1558 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1559
1560 // Get exponent of 2^n which is equal or less than dec_a
1561 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1562
1563 // Get x to range (1, 2]
1564 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1565 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1566 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1567
1568 // Polynomial Approximation
1569 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1570
1571 // Reconstruct
1572 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1573
1574 // Set negative value for 0 < a < 1
1575 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1576
1577 return poly;
1578}
1579
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001580inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1581{
1582 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1583 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1584 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1585
1586 // If 0 < a < 1, calculate log(1/x)
1587 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1588 qint16x4_t recip = vdup_n_s16(0);
1589 recip = vbsl_s16(calc_reciprocal, recip, a);
1590
1591 // Calculate reciprocal
1592 recip = vrecip_qs16(recip, fixed_point_position);
1593 a = vbsl_s16(calc_reciprocal, recip, a);
1594
1595 // Get decimal part of a
1596 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1597 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1598
1599 // Get exponent of 2^n which is equal or less than dec_a
1600 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1601
1602 // Get x to range (1, 2]
1603 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1604 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1605 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1606
1607 // Polynomial Approximation
1608 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1609
1610 // Reconstruct
1611 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1612
1613 // Set negative value for 0 < a < 1
1614 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1615
1616 return poly;
1617}
1618
/** Calculate the natural logarithm of a QS8 vector (16 lanes).
 *
 * Inputs in (0, 1) are handled as log(x) = -log(1/x); the value is then
 * normalised to (1, 2] and approximated with a Taylor polynomial.
 *
 * @param[in] a                    Input vector in fixed point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The calculated logarithm.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // keep only the lanes that need the reciprocal

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a); // substitute 1/a into the a < 1 lanes

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // exponent n in fixed point format

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(x) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1657
/** Calculate the natural logarithm of a QS16 vector (8 lanes).
 *
 * Inputs in (0, 1) are handled as log(x) = -log(1/x); the value is then
 * normalised to (1, 2] and approximated with a Taylor polynomial.
 *
 * @param[in] a                    Input vector in fixed point format.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return The calculated logarithm.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    recip                      = vbslq_s16(calc_reciprocal, a, recip); // keep only the lanes that need the reciprocal

    // Calculate reciprocal
    // NOTE(review): uses the saturating vqrecipq_qs16 here, while vlogq_qs8 uses the
    // non-saturating vrecipq_qs8 — presumably deliberate; confirm against callers
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a); // substitute 1/a into the a < 1 lanes

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // exponent n in fixed point format

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(x) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1696
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001697inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1698{
1699 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1700
1701 // Find shift value. Number must be in (0.5, 2) range.
1702 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1703
1704 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1705 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1706 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
1707 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
1708 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1709
1710 temp = vshl_s8(a, shift_value);
1711
1712 // Initial guess
1713 qint8x8_t x = temp;
1714
1715 // Calculate (x / 2) * (3 - a * x^2)
1716 // After three iterations we have the result for 8 bit
1717 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1718 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1719 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1720
1721 return vshl_s8(x, shift_value2);
1722}
1723
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001724inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
1725{
1726 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1727
1728 // Find shift value. Number must be in (0.5, 2) range.
1729 qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1730
1731 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1732 qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1733 uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
1734 temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
1735 qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
1736
1737 temp = vshl_s16(a, shift_value);
1738
1739 // Initial guess
1740 qint16x4_t x = temp;
1741
1742 // Calculate (x / 2) * (3 - a * x^2)
1743 // After five iterations we have the result for 8 bit
1744 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1745 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1746 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1747 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1748 x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1749
1750 return vshl_s16(x, shift_value2);
1751}
1752
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001753inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1754{
1755 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1756
1757 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001758 qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001759
1760 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001761 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001762 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001763 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001764 qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001765
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001766 temp = vqshl_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001767
1768 // Initial guess
1769 qint8x8_t x = temp;
1770
1771 // Calculate (x / 2) * (3 - a * x^2)
1772 // After three iterations we have the result for 8 bit
1773 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1774 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1775 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1776
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001777 return vqshl_s8(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001778}
1779
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001780inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
1781{
1782 const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
1783
1784 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001785 qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001786
1787 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1788 qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
1789 uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
1790 temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001791 qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001792
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001793 temp = vqshl_s16(a, shift_value);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001794
1795 // Initial guess
1796 qint16x4_t x = temp;
1797
1798 // Calculate (x / 2) * (3 - a * x^2)
1799 // After five iterations we have the result for 16 bit
1800 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1801 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1802 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1803 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1804 x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1805
1806 return vqshl_s16(x, shift_value2);
1807}
1808
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001809inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
1810{
1811 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1812
1813 // Find shift value. Number must be in (0.5, 2) range.
1814 qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
1815
1816 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1817 qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
1818 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
1819 temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
1820 qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
1821
1822 temp = vshlq_s8(a, shift_value);
1823
1824 // Initial guess
1825 qint8x16_t x = temp;
1826
1827 // Calculate (x / 2) * (3 - a * x^2)
1828 // After three iterations we have the result for 8 bit
1829 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1830 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1831 x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1832
1833 return vshlq_s8(x, shift_value2);
1834}
1835
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001836inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1837{
1838 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1839
1840 // Find shift value. Number must be in (0.5, 2) range.
1841 qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1842
1843 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1844 qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1845 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1846 temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
1847 qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
1848
1849 temp = vshlq_s16(a, shift_value);
1850
1851 // Initial guess
1852 qint16x8_t x = temp;
1853
1854 // Calculate (x / 2) * (3 - a * x^2)
1855 // After five iterations we have the result for 16 bit
1856 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1857 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1858 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1859 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1860 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1861
1862 return vshlq_s16(x, shift_value2);
1863}
1864
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001865inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
1866{
1867 const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
1868
1869 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001870 qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001871
1872 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001873 qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001874 uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001875 temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001876 qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001877
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001878 temp = vqshlq_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001879
1880 // Initial guess
1881 qint8x16_t x = temp;
1882
1883 // Calculate (x / 2) * (3 - a * x^2)
1884 // After three iterations we have the result for 8 bit
1885 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1886 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1887 x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1888
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001889 return vqshlq_s8(x, shift_value2);
1890}
1891
1892inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1893{
1894 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1895
1896 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001897 qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001898
1899 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1900 qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1901 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1902 temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001903 qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001904
1905 temp = vqshlq_s16(a, shift_value);
1906
1907 // Initial guess
1908 qint16x8_t x = temp;
1909
1910 // Calculate (x / 2) * (3 - a * x^2)
1911 // After five iterations we have the result for 16 bit
1912 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1913 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1914 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1915 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1916 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1917
1918 return vqshlq_s16(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001919}
1920
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001921inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001922{
1923 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1924 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1925
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001926 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1927 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1928 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1929 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001930
1931 return tanh;
1932}
1933
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001934inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001935{
1936 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1937 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1938
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001939 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1940 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1941 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1942 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001943
1944 return tanh;
1945}
1946
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001947inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001948{
1949 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1950 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1951
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001952 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1953 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1954 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1955 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001956
1957 return tanh;
1958}
1959
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001960inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1961{
1962 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1963 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1964
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001965 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1966 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1967 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1968 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001969
1970 return tanh;
1971}
1972
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001973inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1974{
1975 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1976}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001977
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001978inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1979{
1980 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1981}
1982
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001983inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1984{
1985 float32x4x2_t res =
1986 {
1987 {
1988 vmaxq_f32(a.val[0], b.val[0]),
1989 vmaxq_f32(a.val[1], b.val[1])
1990 }
1991 };
1992 return res;
1993}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001994}