blob: c879d3e275db2aa181e8e0e5f4125fbd4b183517 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements per vector)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};
39
/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements per vector)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};
51
/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements per vector)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};
63
/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements per vector)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements per vector)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};
87
/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements per vector)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};
99
/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements per vector)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};
111
/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements per vector)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Extract the low half (lower 8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the low half (lower 4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the high half (upper 8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the high half (upper 4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 consecutive 8 bit fixed point values from @p addr. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 consecutive 16 bit fixed point values from @p addr. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 consecutive 8 bit fixed point values from @p addr. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 consecutive 16 bit fixed point values from @p addr. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one 8 bit fixed point value and broadcast it to all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one 16 bit fixed point value and broadcast it to all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one 8 bit fixed point value and broadcast it to all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one 16 bit fixed point value and broadcast it to all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load and de-interleave 2x8 16 bit fixed point values from @p addr. */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 consecutive 8 bit fixed point values to @p addr. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 consecutive 16 bit fixed point values to @p addr. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 consecutive 8 bit fixed point values to @p addr. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 consecutive 16 bit fixed point values to @p addr. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Interleave and store 2x8 16 bit fixed point values to @p addr. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow of an 8-lane 16 bit fixed point vector to 8 bit lanes. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow of a 4-lane 32 bit fixed point vector to 16 bit lanes. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
223
/** Broadcast an 8 bit fixed point scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a 16 bit fixed point scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast an 8 bit fixed point scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
/** Broadcast a float scalar to all 16 lanes of an 8 bit fixed point vector.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 16-lane vector with every lane equal to @p a converted to fixed point
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Broadcast once and replicate the same quad over the four float vectors
    const float32x4_t  dup       = vdupq_n_f32(a);
    const float32x4x4_t broadcast =
    {
        {
            dup,
            dup,
            dup,
            dup,
        }
    };
    return vqcvtq_qs8_f32(broadcast, fixed_point_position);
}
252
/** Broadcast a float scalar to all 8 lanes of a 16 bit fixed point vector.
 *
 * @param[in] a                    Float value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 8-lane vector with every lane equal to @p a converted to fixed point
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    // Broadcast once and replicate the same quad over both float vectors
    const float32x4_t  dup       = vdupq_n_f32(a);
    const float32x4x2_t broadcast =
    {
        {
            dup,
            dup,
        }
    };
    return vqcvtq_qs16_f32(broadcast, fixed_point_position);
}
264
/** Broadcast a 16 bit fixed point scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a 32 bit fixed point scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
274
/** Absolute value of an 8-lane 8 bit fixed point vector (non-saturating). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of a 4-lane 16 bit fixed point vector (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of a 16-lane 8 bit fixed point vector (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of an 8-lane 16 bit fixed point vector (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of an 8-lane 8 bit fixed point vector. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of a 4-lane 16 bit fixed point vector. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of a 16-lane 8 bit fixed point vector. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of an 8-lane 16 bit fixed point vector. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
314
/** Lane-wise maximum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}
344
/** Lane-wise minimum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
374
/** Lane-wise addition of two 8-lane 8 bit fixed point vectors (wrapping). */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Lane-wise addition of two 4-lane 16 bit fixed point vectors (wrapping). */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Lane-wise addition of two 16-lane 8 bit fixed point vectors (wrapping). */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Lane-wise addition of two 8-lane 16 bit fixed point vectors (wrapping). */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating lane-wise addition of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating lane-wise addition of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating lane-wise addition of two 2-lane 32 bit fixed point vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Saturating lane-wise addition of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating lane-wise addition of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Saturating lane-wise addition of two 4-lane 32 bit fixed point vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}
424
/** Pairwise add-long: sums adjacent pairs of 8 bit fixed point lanes into 16 bit lanes.
 *
 * @param[in] a 8-lane 8 bit fixed point input vector
 *
 * @return 4-lane 16 bit vector of pairwise sums
 */
// Consistency fix: return the project's qint16x4_t alias (an int16x4_t, as the
// interchangeable use of the two types elsewhere in this file shows) instead of
// the raw NEON type, matching every other wrapper in this header. ABI unchanged.
inline qint16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
429
/** Lane-wise subtraction of two 8-lane 8 bit fixed point vectors (wrapping). */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Lane-wise subtraction of two 4-lane 16 bit fixed point vectors (wrapping). */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Lane-wise subtraction of two 16-lane 8 bit fixed point vectors (wrapping). */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Lane-wise subtraction of two 8-lane 16 bit fixed point vectors (wrapping). */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating lane-wise subtraction of two 8-lane 8 bit fixed point vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating lane-wise subtraction of two 4-lane 16 bit fixed point vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating lane-wise subtraction of two 16-lane 8 bit fixed point vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating lane-wise subtraction of two 8-lane 16 bit fixed point vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
469
/** Multiply two 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs a right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}
486
/** Multiply two 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // vshlq with a negative shift count performs a right shift
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
503
/** Multiply two 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * Widens each half to 16 bit, multiplies, rounds, shifts back, then recombines.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}
523
/** Multiply two 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * Widens each half to 32 bit, multiplies, rounds, shifts back, then recombines.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product in the same fixed point format (non-saturating narrow)
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
543
/** Saturating multiply of two 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}
560
/** Saturating multiply of two 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
577
/** Saturating multiply of two 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}
597
/** Saturating multiply of two 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Lane-wise product, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
617
/** Widening multiply of two 8-lane 8 bit fixed point vectors.
 *
 * @param[in] a                    First operand
 * @param[in] b                    Second operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return 16 bit lane-wise product, rounded and saturated by the shift
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Widening multiply, then a saturating rounding right shift
    // (vqrshlq with a negative count shifts right)
    const qint16x8_t product = vmull_s8(a, b);
    const int16x8_t  shift   = vdupq_n_s16(-fixed_point_position);
    return vqrshlq_s16(product, shift);
}
626
/** Multiply-accumulate: a + b * c on 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}
643
/** Multiply-accumulate: a + b * c on 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
660
/** Multiply-accumulate: a + b * c on 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}
680
/** Multiply-accumulate: a + b * c on 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c in the same fixed point format (non-saturating)
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
700
/** Saturating multiply-accumulate: a + b * c on 8-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}
717
/** Saturating multiply-accumulate: a + b * c on 4-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
734
/** Saturating multiply-accumulate: a + b * c on 16-lane 8 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}
755
/** Saturating multiply-accumulate: a + b * c on 8-lane 16 bit fixed point vectors, rounding to nearest.
 *
 * @param[in] a                    Accumulator
 * @param[in] b                    First multiplication operand
 * @param[in] c                    Second multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
776
/** Widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator, rounding to nearest.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiplication operand
 * @param[in] c                    Second 8 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 16 bit fixed point vector (non-saturating)
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}
793
/** Widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator, rounding to nearest.
 *
 * @param[in] a                    32 bit accumulator
 * @param[in] b                    First 16 bit multiplication operand
 * @param[in] c                    Second 16 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c as a 32 bit fixed point vector (non-saturating)
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
810
/** Saturating widening multiply-accumulate: a + b * c, 8 bit operands into a 16 bit accumulator.
 *
 * @param[in] a                    16 bit accumulator
 * @param[in] b                    First 8 bit multiplication operand
 * @param[in] c                    Second 8 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 16 bit fixed point range
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}
827
/** Saturating widening multiply-accumulate: a + b * c, 16 bit operands into a 32 bit accumulator.
 *
 * @param[in] a                    32 bit accumulator
 * @param[in] b                    First 16 bit multiplication operand
 * @param[in] c                    Second 16 bit multiplication operand
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return a + b * c, saturated to the 32 bit fixed point range
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
844
/** Convert 8 floats (two float vectors) to an 8-lane 8 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Two float32x4 vectors holding the values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 8 bit fixed point range
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale by 2^fixed_point_position and add the rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to integers
    const int32x4_t res_s32[] =
    {
    };
    const int32x4x2_t res_s32_x2 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow 32 -> 16 -> 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32_x2.val[0]), vqmovn_s32(res_s32_x2.val[1]));

    return vqmovn_s16(res_s16);
}
872
/** Convert 4 floats to a 4-lane 16 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Float values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 16 bit fixed point range
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    const uint32x4_t  non_negative = vcgeq_f32(a, vdupq_n_f32(0));
    const float32x4_t rounding     = vbslq_f32(non_negative, vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // Scale by 2^fixed_point_position, add the offset, truncate and saturate-narrow
    const float32x4_t scaled = vmlaq_f32(rounding, a, scale);
    return vqmovn_s32(vcvtq_s32_f32(scaled));
}
885
/** Convert 16 floats (four float vectors) to a 16-lane 8 bit fixed point vector with saturation.
 *
 * Rounds to nearest by adding +/-0.5 depending on the sign of each lane before truncation.
 *
 * @param[in] a                    Four float32x4 vectors holding the values to convert
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Converted values, saturated to the 8 bit fixed point range
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // Scale by 2^fixed_point_position and add the rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to integers
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow 32 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow 16 -> 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
925
// Convert eight floats (2x float32x4) to 16-bit fixed point (Qx.fixed_point_position).
// Values are scaled by 2^fixed_point_position, rounded half away from zero, and
// saturated on the final narrowing.
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 for negative ones
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fixed_point_position + rounding offset
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Saturating narrow 32 -> 16 bits
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
951
// Convert eight 8-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as two float32x4 vectors (low half in val[0], high half in val[1]).
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
979
/* Convert four 16-bit fixed-point values (Qx.fixed_point_position) to floats:
 * sign-extend to 32 bits, convert, then scale by 2^-fixed_point_position. */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    const float32x4_t scale    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    const float32x4_t widened  = vcvtq_f32_s32(vmovl_s16(a));
    return vmulq_f32(widened, scale);
}
987
// Convert sixteen 8-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as four float32x4 vectors in ascending lane order.
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1027
// Convert eight 16-bit fixed-point values (Qx.fixed_point_position) to floats,
// returned as two float32x4 vectors (low half in val[0], high half in val[1]).
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    // Apply the fixed-point scale
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1053
// Compute the reciprocal 1/a of eight 8-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1078
// Compute the reciprocal 1/a of four 16-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: the element width here is 16 bits (was 8, copied from the qint8 variant and
    // matching neither vclz_s16's range nor the 128-bit vrecipq_qs16, which uses 16).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1105
// Compute the reciprocal 1/a of eight 8-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1130
// Compute the reciprocal 1/a of four 16-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: the element width here is 16 bits (was 8, copied from the qint8 variant and
    // matching neither vclz_s16's range nor the 128-bit vqrecipq_qs16, which uses 16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most one
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1157
// Compute the reciprocal 1/a of sixteen 8-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1183
// Compute the reciprocal 1/a of eight 16-bit fixed-point values using
// Newton-Raphson iteration (non-saturating arithmetic).
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: constant was 0x5A56; 48/17 ~= 2.8235 is 0x5A5A in Q2.13, matching the
    // vrecip_qs16 / vqrecip_qs16 variants.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element width in bits)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1211
// Compute the reciprocal 1/a of sixteen 8-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic.
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width in bits)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1237
// Compute the reciprocal 1/a of eight 16-bit fixed-point values using
// Newton-Raphson iteration with saturating arithmetic. Lanes where a == 0
// return the maximum representable value.
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: constant was 0x5A56; 48/17 ~= 2.8235 is 0x5A5A in Q2.13, matching the
    // vrecip_qs16 / vqrecip_qs16 variants.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element width in bits)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: x0 = 48/17 - 32/17 * a
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x + x*(1 - a*x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow (division by zero yields the max value)
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1266
/* Fixed-point division a/b for eight 8-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1271
/* Fixed-point division a/b for four 16-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1276
/* Fixed-point division a/b for sixteen 8-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1281
/* Fixed-point division a/b for eight 16-bit lanes, computed as a * (1/b)
 * with the Newton-Raphson reciprocal (non-saturating). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1286
// Evaluate the 4th-order polynomial a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on eight 8-bit fixed-point lanes (non-saturating arithmetic).
// Coefficients come from log_tab_qs8 when islog, otherwise exp_tab_qs8, and are
// rescaled from Q0.7 to the requested Qx.fixed_point_position format.
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1302
// Evaluate the 4th-order polynomial a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on four 16-bit fixed-point lanes (non-saturating arithmetic).
// Coefficients come from log_tab_qs16 when islog, otherwise exp_tab_qs16, and
// are rescaled from Q0.15 to the requested Qx.fixed_point_position format.
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1318
// Saturating variant of vtaylor_poly_qs8: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on eight 8-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tab_qs8 when islog, otherwise exp_tab_qs8.
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one = vdup_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1334
// Saturating variant of vtaylor_poly_qs16: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on four 16-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tab_qs16 when islog, otherwise exp_tab_qs16.
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one = vdup_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1350
// 128-bit variant of vtaylor_poly_qs8: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on sixteen 8-bit fixed-point lanes (non-saturating).
// Coefficients come from log_tabq_qs8 when islog, otherwise exp_tabq_qs8.
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1366
// 128-bit variant of vtaylor_poly_qs16: evaluate a*(A + a*(B + a*(C + a*D)))
// via Horner's scheme on eight 16-bit fixed-point lanes (non-saturating).
// Coefficients come from log_tabq_qs16 when islog, otherwise exp_tabq_qs16.
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1382
// Saturating 128-bit variant: evaluate a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on sixteen 8-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tabq_qs8 when islog, otherwise exp_tabq_qs8.
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.7 table entries to the target format
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one = vdupq_n_s8(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1398
// Saturating 128-bit variant: evaluate a*(A + a*(B + a*(C + a*D))) via Horner's
// scheme on eight 16-bit fixed-point lanes using saturating ops.
// Coefficients come from log_tabq_qs16 when islog, otherwise exp_tabq_qs16.
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Right-shift amount converting Q0.15 table entries to the target format
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one = vdupq_n_s16(1);
    // The log table's first coefficient needs one extra bit of shift
    const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1414
// Compute exp(a) on eight 8-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Shift converting the Q0.7 constants below to the target format
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1440
// Compute exp(a) on four 16-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Shift converting the Q0.15 constants below to the target format
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1466
// Compute exp(a) on sixteen 8-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Shift converting the Q0.7 constants below to the target format
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1492
// Compute exp(a) on eight 16-bit fixed-point lanes with saturating arithmetic:
// range-reduce to [-ln(2), ln(2)], approximate with the exp polynomial table,
// then reconstruct by shifting with the integer multiple of ln(2).
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Shift converting the Q0.15 constants below to the target format
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1518
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001519inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1520{
1521 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1522 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1523 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1524
1525 // If 0 < a < 1, calculate log(1/x)
1526 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1527 qint8x8_t recip = vdup_n_s8(0);
1528 recip = vbsl_s8(calc_reciprocal, recip, a);
1529
1530 // Calculate reciprocal
1531 recip = vrecip_qs8(recip, fixed_point_position);
1532 a = vbsl_s8(calc_reciprocal, recip, a);
1533
1534 // Get decimal part of a
1535 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1536 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1537
1538 // Get exponent of 2^n which is equal or less than dec_a
1539 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1540
1541 // Get x to range (1, 2]
1542 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1543 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1544 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1545
1546 // Polynomial Approximation
1547 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1548
1549 // Reconstruct
1550 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1551
1552 // Set negative value for 0 < a < 1
1553 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1554
1555 return poly;
1556}
1557
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001558inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1559{
1560 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1561 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1562 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1563
1564 // If 0 < a < 1, calculate log(1/x)
1565 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1566 qint16x4_t recip = vdup_n_s16(0);
1567 recip = vbsl_s16(calc_reciprocal, recip, a);
1568
1569 // Calculate reciprocal
1570 recip = vrecip_qs16(recip, fixed_point_position);
1571 a = vbsl_s16(calc_reciprocal, recip, a);
1572
1573 // Get decimal part of a
1574 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1575 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1576
1577 // Get exponent of 2^n which is equal or less than dec_a
1578 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1579
1580 // Get x to range (1, 2]
1581 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1582 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1583 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1584
1585 // Polynomial Approximation
1586 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1587
1588 // Reconstruct
1589 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1590
1591 // Set negative value for 0 < a < 1
1592 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1593
1594 return poly;
1595}
1596
/** Calculate the natural logarithm of a 16-lane QS8 fixed point vector.
 *
 * Inputs in (0, 1) are handled via log(a) = -log(1/a). The argument is then
 * normalized into (1, 2], a Taylor polynomial approximates log of the residual,
 * and the result is reconstructed as (poly + n) * ln(2) where 2^n was the
 * normalizing factor.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane log(a) in the same fixed point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // lanes needing the reciprocal get 'a', others 0

    // Calculate reciprocal and substitute it into the affected lanes
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one); // n in fixed point format

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1635
/** Calculate the natural logarithm of an 8-lane QS16 fixed point vector.
 *
 * Inputs in (0, 1) are handled via log(a) = -log(1/a). The argument is then
 * normalized into (1, 2], a Taylor polynomial approximates log of the residual,
 * and the result is reconstructed as (poly + n) * ln(2). Note this variant uses
 * saturating sub/add/mul in several steps, unlike vlogq_qs8.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane log(a) in the same fixed point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    recip                      = vbslq_s16(calc_reciprocal, a, recip); // lanes needing the reciprocal get 'a', others 0

    // Calculate reciprocal (saturating variant) and substitute it into the affected lanes
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one); // n in fixed point format

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2), saturating
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1674
/** Calculate the inverse square root 1/sqrt(a) of an 8-lane QS8 fixed point vector.
 *
 * Normalizes the input into (0.5, 2) by an even power-of-two shift, then runs
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2) and undoes half the
 * normalizing shift at the end (since sqrt(2^s) = 2^(s/2)).
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vshl_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1701
/** Calculate the inverse square root 1/sqrt(a) of a 4-lane QS16 fixed point vector.
 *
 * Same scheme as vinvsqrt_qs8: normalize into (0.5, 2), apply Newton-Raphson
 * iterations x <- (x / 2) * (3 - a * x^2), then undo half the normalizing shift.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vshl_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1730
/** Calculate the saturating inverse square root 1/sqrt(a) of an 8-lane QS8 fixed point vector.
 *
 * Saturating counterpart of vinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vqshl_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1757
/** Calculate the saturating inverse square root 1/sqrt(a) of a 4-lane QS16 fixed point vector.
 *
 * Saturating counterpart of vinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vqshl_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1786
/** Calculate the inverse square root 1/sqrt(a) of a 16-lane QS8 fixed point vector.
 *
 * Quad-register variant of vinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vshlq_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1813
/** Calculate the inverse square root 1/sqrt(a) of an 8-lane QS16 fixed point vector.
 *
 * Quad-register variant of vinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vshlq_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1842
/** Calculate the saturating inverse square root 1/sqrt(a) of a 16-lane QS8 fixed point vector.
 *
 * Quad-register variant of vqinvsqrt_qs8: normalize into (0.5, 2), run three
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(7-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1)); // half the shift, applied after the iterations

    temp = vqshlq_s8(a, shift_value); // normalized input

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1869
/** Calculate the saturating inverse square root 1/sqrt(a) of an 8-lane QS16 fixed point vector.
 *
 * Quad-register variant of vqinvsqrt_qs16: normalize into (0.5, 2), run five
 * Newton-Raphson iterations x <- (x / 2) * (3 - a * x^2), undo half the
 * normalizing shift. Arithmetic steps use the saturating intrinsics.
 *
 * @param[in] a                    Input vector in Q(15-n).n format, n = fixed_point_position. Expected positive.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return Per-lane 1/sqrt(a), saturated, in the same fixed point format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1)); // half the shift, applied after the iterations

    temp = vqshlq_s16(a, shift_value); // normalized input

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1898
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001899inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001900{
1901 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1902 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1903
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001904 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1905 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1906 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1907 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001908
1909 return tanh;
1910}
1911
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001912inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001913{
1914 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1915 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1916
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001917 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1918 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1919 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1920 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001921
1922 return tanh;
1923}
1924
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001925inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001926{
1927 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1928 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1929
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001930 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1931 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1932 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1933 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001934
1935 return tanh;
1936}
1937
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001938inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1939{
1940 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1941 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1942
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001943 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1944 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1945 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1946 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001947
1948 return tanh;
1949}
1950
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001951inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1952{
1953 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1954}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001955
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001956inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1957{
1958 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1959}
1960
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001961inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1962{
1963 float32x4x2_t res =
1964 {
1965 {
1966 vmaxq_f32(a.val[0], b.val[0]),
1967 vmaxq_f32(a.val[1], b.val[1])
1968 }
1969 };
1970 return res;
1971}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001972}