blob: f62a338a61cc3145692f97d67b18adc698317f13 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25namespace arm_compute
26{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
/** Extract the lower half (8 lanes) of a 16-lane QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the lower half (4 lanes) of an 8-lane QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the upper half (8 lanes) of a 16-lane QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the upper half (4 lanes) of an 8-lane QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load 8 QS8 values from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 values from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 values from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 values from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load a single QS8 value and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single QS16 value and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single QS8 value and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single QS16 value and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store 8 QS8 values to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 values to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 values to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 values to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two 8-lane QS16 vectors to memory with 2-way interleaving. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
207
/** Saturating narrow: QS16 (8 lanes) to QS8 (8 lanes). */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32 (4 lanes) to QS16 (4 lanes). */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Broadcast a QS8 scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a QS16 scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast a QS8 scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Broadcast a float to all 16 lanes of a QS8 vector.
 *
 * The float is replicated into four float32x4_t vectors and converted to
 * fixed point via vqcvtq_qs8_f32 (defined further down in this file).
 *
 * @param a                    Value to broadcast
 * @param fixed_point_position Number of fractional bits of the QS8 format
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Broadcast a QS16 scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a QS32 scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
256
/** Absolute value of 8 QS8 lanes (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 lanes (non-saturating: |INT16_MIN| wraps). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 lanes (non-saturating: |INT8_MIN| wraps). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 lanes (non-saturating: |INT16_MIN| wraps). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 lanes (|INT8_MIN| clamps to INT8_MAX). */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 lanes (|INT16_MIN| clamps to INT16_MAX). */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 lanes (|INT8_MIN| clamps to INT8_MAX). */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 lanes (|INT16_MIN| clamps to INT16_MAX). */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
296
/** Lane-wise maximum of two 8-lane QS8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of two 4-lane QS16 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of two 16-lane QS8 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum: max of adjacent QS8 lane pairs from a then b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum: max of adjacent QS16 lane pairs from a then b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of two 8-lane QS16 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Lane-wise minimum of two 8-lane QS8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of two 4-lane QS16 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of two 16-lane QS8 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum: min of adjacent QS8 lane pairs from a then b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum: min of adjacent QS16 lane pairs from a then b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of two 8-lane QS16 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
356
/** Lane-wise wrapping addition of two 8-lane QS8 vectors. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Lane-wise wrapping addition of two 4-lane QS16 vectors. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Lane-wise wrapping addition of two 16-lane QS8 vectors. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Lane-wise wrapping addition of two 8-lane QS16 vectors. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Lane-wise saturating addition of two 8-lane QS8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Lane-wise saturating addition of two 4-lane QS16 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Lane-wise saturating addition of two 2-lane QS32 vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Lane-wise saturating addition of two 16-lane QS8 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Lane-wise saturating addition of two 8-lane QS16 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Lane-wise saturating addition of two 4-lane QS32 vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Widening pairwise addition: sums adjacent QS8 lane pairs into 4 x int16. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

/** Lane-wise wrapping subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Lane-wise wrapping subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Lane-wise wrapping subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Lane-wise wrapping subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Lane-wise saturating subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Lane-wise saturating subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Lane-wise saturating subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Lane-wise saturating subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
451
/** Fixed-point multiplication of two 8-lane QS8 vectors (non-saturating narrow).
 *
 * Widens to 16 bit, multiply-accumulates onto a rounding constant, then shifts
 * right by @p fixed_point_position (vshlq_s16 with a negative count performs an
 * arithmetic right shift).
 *
 * NOTE(review): assumes fixed_point_position >= 1 — with 0 the rounding term
 * `1 << (fixed_point_position - 1)` is undefined behaviour. TODO confirm callers
 * guarantee this.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed-point multiplication of two 4-lane QS16 vectors (non-saturating narrow).
 * Same rounding scheme as vmul_qs8, widening through 32 bit. */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Fixed-point multiplication of two 16-lane QS8 vectors (non-saturating narrow).
 * Processes the low and high halves separately through 16-bit accumulators. */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed-point multiplication of two 8-lane QS16 vectors (non-saturating narrow).
 * Processes the low and high halves separately through 32-bit accumulators. */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
525
/** Saturating fixed-point multiplication of two 8-lane QS8 vectors.
 *
 * Same rounding scheme as vmul_qs8, but both the shift (vqshlq) and the
 * narrowing (vqmovn) saturate instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiplication of two 4-lane QS16 vectors. */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating fixed-point multiplication of two 16-lane QS8 vectors.
 * Processes low and high halves separately through 16-bit accumulators. */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiplication of two 8-lane QS16 vectors.
 * Processes low and high halves separately through 32-bit accumulators. */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
599
/** Widening fixed-point multiplication: 8 QS8 lanes to 8 QS16 lanes.
 *
 * Uses vqrshlq_s16 (saturating *rounding* shift) with a negative count, so no
 * explicit rounding constant is needed here, unlike the narrowing multiplies.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
608
/** Fixed-point multiply-accumulate: a + b * c on 8 QS8 lanes (non-saturating).
 *
 * The product is computed in 16 bit with round-to-nearest (rounding constant
 * then arithmetic right shift), narrowed, and added to @p a with wrapping.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on 4 QS16 lanes (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on 16 QS8 lanes (non-saturating).
 * Low and high halves are processed through separate 16-bit accumulators. */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c on 8 QS16 lanes (non-saturating).
 * Low and high halves are processed through separate 32-bit accumulators. */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
682
/** Saturating fixed-point multiply-accumulate: a + b * c on 8 QS8 lanes.
 *
 * Like vmla_qs8 but the shift, the narrowing, and the final addition all
 * saturate (vqshlq / vqmovn / vqadd).
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 4 QS16 lanes. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 16 QS8 lanes.
 * Low and high halves are processed through separate 16-bit accumulators. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c on 8 QS16 lanes.
 * Low and high halves are processed through separate 32-bit accumulators. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
758
/** Widening fixed-point multiply-accumulate: a + b * c, QS8 operands into a
 * QS16 accumulator (non-saturating).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a + b * c, QS16 operands into a
 * QS32 accumulator (non-saturating).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Widening saturating fixed-point multiply-accumulate: a + b * c, QS8 operands
 * into a QS16 accumulator; shift and accumulation saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Widening saturating fixed-point multiply-accumulate: a + b * c, QS16 operands
 * into a QS32 accumulator; shift and accumulation saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
826
/** Convert 8 floats (two float32x4_t) to 8 QS8 lanes, saturating.
 *
 * Scales by 2^fixed_point_position and rounds half away from zero: a bias of
 * +0.5 (or -0.5 for negative inputs, selected with vbslq_f32) is added before
 * vcvtq_s32_f32, which truncates toward zero. The result is then narrowed with
 * saturation through int16 to int8.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert 4 floats to 4 QS16 lanes, saturating. Same rounding scheme as
 * vqcvt_qs8_f32. */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    return vqmovn_s32(res_s32);
}

/** Convert 16 floats (four float32x4_t) to 16 QS8 lanes, saturating. Same
 * rounding scheme as vqcvt_qs8_f32. */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}

/** Convert 8 floats (two float32x4_t) to 8 QS16 lanes, saturating. Same
 * rounding scheme as vqcvt_qs8_f32. */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
933
/** Convert a QS8 vector (8 elements) to two float32x4 vectors. */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen 8 -> 16 bit before splitting into two 32-bit halves.
    const int16x8_t widened = vmovl_s8(a);

    // Widen 16 -> 32 bit and convert to float.
    float32x4x2_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened)))
        }
    };

    out.val[0] = vmulq_f32(out.val[0], scale);
    out.val[1] = vmulq_f32(out.val[1], scale);

    return out;
}
961
/** Convert a QS16 vector (4 elements) to a float32x4 vector. */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Widen to 32 bit, convert to float, then scale by 2^-p to place the binary point.
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), vdupq_n_f32(1.0f / (1 << fixed_point_position)));
}
969
/** Convert a QS8 vector (16 elements) to four float32x4 vectors. */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen 8 -> 16 bit.
    const int16x8_t lo16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi16 = vmovl_s8(vget_high_s8(a));

    // Widen 16 -> 32 bit and convert to float.
    float32x4x4_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(lo16))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(lo16))),
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(hi16))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(hi16)))
        }
    };

    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(out.val[i], scale);
    }

    return out;
}
1009
/** Convert a QS16 vector (8 elements) to two float32x4 vectors. */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Multiply by 2^-p to place the binary point.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half 16 -> 32 bit and convert to float.
    float32x4x2_t out =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))),
            vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a)))
        }
    };

    out.val[0] = vmulq_f32(out.val[0], scale);
    out.val[1] = vmulq_f32(out.val[1], scale);

    return out;
}
1035
/** Reciprocal 1/a of a QS8 vector (8 elements) via Newton-Raphson. */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vadd_s8), matching vrecipq_qs8.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1059
/** Reciprocal 1/a of a QS16 vector (4 elements) via Newton-Raphson. */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1). Fix: the element width
    // is 16 bits, not 8 (matches vrecipq_qs16's vdupq_n_s16(16)).
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vadd_s16), matching vrecipq_qs16.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1085
/** Saturating reciprocal 1/a of a QS8 vector (8 elements) via Newton-Raphson. */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vqadd_s8), matching vqrecipq_qs8.
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1109
/** Saturating reciprocal 1/a of a QS16 vector (4 elements) via Newton-Raphson. */
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1). Fix: the element width
    // is 16 bits, not 8 (matches vrecipq_qs16).
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: the linear term must be
    // subtracted (was vqadd_s16), matching vqrecipq_qs16.
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one to avoid overflow in the iterations
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x).
    // Fix: use the saturating multiply (vqmul_qs16) throughout this saturating variant.
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1135
/** Reciprocal 1/a of a QS8 vector (16 elements) via Newton-Raphson: x = x * (2 - a * x). */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1160
/** Reciprocal 1/a of a QS16 vector (8 elements) via Newton-Raphson. */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 rounds to 0x5A5A (was 0x5A56), matching vrecip_qs16.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1) so the iteration converges
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1187
/** Saturating reciprocal 1/a of a QS8 vector (16 elements) via Newton-Raphson. */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d. Fix: use the saturating multiply
    // (vqmulq_qs8, was vmulq_qs8) for consistency with this saturating variant.
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1212
/** Saturating reciprocal 1/a of a QS16 vector (8 elements) via Newton-Raphson. */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 rounds to 0x5A5A (was 0x5A56), matching vrecip_qs16.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes a into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial estimate x0 = 48/17 - 32/17 * d
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result: x = x * (2 - d * x)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s16(x, shift_value);
}
1239
/** Element-wise division a / b of QS8 vectors (8 elements). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint8x8_t b_recip = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_recip, fixed_point_position);
}
1244
/** Element-wise division a / b of QS16 vectors (4 elements). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint16x4_t b_recip = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_recip, fixed_point_position);
}
1249
/** Element-wise division a / b of QS8 vectors (16 elements). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint8x16_t b_recip = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_recip, fixed_point_position);
}
1254
/** Element-wise division a / b of QS16 vectors (8 elements). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a / b == a * (1 / b) in fixed point.
    const qint16x8_t b_recip = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_recip, fixed_point_position);
}
1259
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (8 elements). */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x8_t tab_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit   = vdup_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x8_t c0 = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(tab_shift, one_bit) : tab_shift);
    const qint8x8_t c1 = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], tab_shift);
    const qint8x8_t c2 = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], tab_shift);
    const qint8x8_t c3 = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint8x8_t acc = vadd_s8(vmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c0);
    return vmul_qs8(a, acc, fixed_point_position);
}
1275
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (4 elements). */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x4_t tab_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit   = vdup_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x4_t c0 = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(tab_shift, one_bit) : tab_shift);
    const qint16x4_t c1 = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], tab_shift);
    const qint16x4_t c2 = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], tab_shift);
    const qint16x4_t c3 = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint16x4_t acc = vadd_s16(vmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c0);
    return vmul_qs16(a, acc, fixed_point_position);
}
1291
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (8 elements). */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x8_t tab_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit   = vdup_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x8_t c0 = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(tab_shift, one_bit) : tab_shift);
    const qint8x8_t c1 = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], tab_shift);
    const qint8x8_t c2 = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], tab_shift);
    const qint8x8_t c3 = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint8x8_t acc = vqadd_s8(vqmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c0);
    return vqmul_qs8(a, acc, fixed_point_position);
}
1307
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (4 elements). */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x4_t tab_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit   = vdup_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x4_t c0 = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(tab_shift, one_bit) : tab_shift);
    const qint16x4_t c1 = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], tab_shift);
    const qint16x4_t c2 = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], tab_shift);
    const qint16x4_t c3 = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint16x4_t acc = vqadd_s16(vqmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c0);
    return vqmul_qs16(a, acc, fixed_point_position);
}
1323
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (16 elements). */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x16_t tab_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit   = vdupq_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x16_t c0 = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(tab_shift, one_bit) : tab_shift);
    const qint8x16_t c1 = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], tab_shift);
    const qint8x16_t c2 = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], tab_shift);
    const qint8x16_t c3 = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint8x16_t acc = vaddq_s8(vmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c0);
    return vmulq_qs8(a, acc, fixed_point_position);
}
1339
/** Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (8 elements). */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x8_t tab_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit   = vdupq_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x8_t c0 = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(tab_shift, one_bit) : tab_shift);
    const qint16x8_t c1 = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], tab_shift);
    const qint16x8_t c2 = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], tab_shift);
    const qint16x8_t c3 = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc
    qint16x8_t acc = vaddq_s16(vmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c0);
    return vmulq_qs16(a, acc, fixed_point_position);
}
1355
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS8 (16 elements). */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table coefficients to the caller's fixed point format.
    const qint8x16_t tab_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit   = vdupq_n_s8(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint8x16_t c0 = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(tab_shift, one_bit) : tab_shift);
    const qint8x16_t c1 = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], tab_shift);
    const qint8x16_t c2 = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], tab_shift);
    const qint8x16_t c3 = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint8x16_t acc = vqaddq_s8(vqmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c0);
    return vqmulq_qs8(a, acc, fixed_point_position);
}
1371
/** Saturating Horner evaluation of the 4-term exp/log Taylor polynomial for QS16 (8 elements). */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table coefficients to the caller's fixed point format.
    const qint16x8_t tab_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit   = vdupq_n_s16(1);
    // The leading log coefficient carries one extra bit of magnitude.
    const qint16x8_t c0 = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(tab_shift, one_bit) : tab_shift);
    const qint16x8_t c1 = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], tab_shift);
    const qint16x8_t c2 = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], tab_shift);
    const qint16x8_t c3 = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], tab_shift);
    // acc = c0 + a*(c1 + a*(c2 + a*c3)); result = a * acc (all ops saturating)
    qint16x8_t acc = vqaddq_s16(vqmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c0);
    return vqmulq_qs16(a, acc, fixed_point_position);
}
1387
/** Saturating exponential exp(a) of a QS8 vector (8 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 0x38 encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1413
/** Saturating exponential exp(a) of a QS16 vector (4 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1439
/** Saturating exponential exp(a) of a QS8 vector (16 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // 0x38 encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1465
/** Saturating exponential exp(a) of a QS16 vector (8 elements):
 *  range-reduce to [0, ln(2)), evaluate the Taylor polynomial, then
 *  reconstruct by shifting with the integer multiple of ln(2). */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    // 0x38AA encodes the fractional part of 1/ln(2); OR-ing const_one sets the integer bit.
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift right drops the fractional bits)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within one octave
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m via a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1491
/** Natural logarithm ln(a) of a QS8 vector (8 elements). */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x) and negate the result at the end
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Fix: select 'a' into the lanes that take the reciprocal path (mask true).
    // The original selected the constant 0 for those lanes, feeding 0 into
    // vrecip_qs8 and producing garbage for every input below one.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal (lanes that fed 0 are discarded by the select below)
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n (in fixed point)
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log(1 + temp)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: ln(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1530
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001531inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1532{
1533 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1534 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1535 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1536
1537 // If 0 < a < 1, calculate log(1/x)
1538 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1539 qint16x4_t recip = vdup_n_s16(0);
1540 recip = vbsl_s16(calc_reciprocal, recip, a);
1541
1542 // Calculate reciprocal
1543 recip = vrecip_qs16(recip, fixed_point_position);
1544 a = vbsl_s16(calc_reciprocal, recip, a);
1545
1546 // Get decimal part of a
1547 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1548 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1549
1550 // Get exponent of 2^n which is equal or less than dec_a
1551 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1552
1553 // Get x to range (1, 2]
1554 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1555 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1556 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1557
1558 // Polynomial Approximation
1559 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1560
1561 // Reconstruct
1562 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1563
1564 // Set negative value for 0 < a < 1
1565 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1566
1567 return poly;
1568}
1569
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001570inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
1571{
1572 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1573 const qint8x16_t const_seven_dec = vdupq_n_s8(7);
1574 const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1575
1576 // If 0 < a < 1, calculate log(1/x)
1577 uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
1578 qint8x16_t recip = vdupq_n_s8(0);
1579 recip = vbslq_s8(calc_reciprocal, a, recip);
1580
1581 // Calculate reciprocal
1582 recip = vrecipq_qs8(recip, fixed_point_position);
1583 a = vbslq_s8(calc_reciprocal, recip, a);
1584
1585 // Get decimal part of a
1586 qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
1587 qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
1588
1589 // Get exponent of 2^n which is equal or less than dec_a
1590 shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
1591
1592 // Get x to range (1, 2]
1593 const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
1594 const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
1595 const qint8x16_t sum = vmulq_s8(shift_value, const_one);
1596
1597 // Polynomial Approximation
1598 qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
1599
1600 // Reconstruct
1601 poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
1602
1603 // Set negative value for 0 < a < 1
1604 poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
1605
1606 return poly;
1607}
1608
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001609inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
1610{
1611 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1612 const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
1613 const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1614
1615 // If 0 < a < 1, calculate log(1/x)
1616 uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
1617 qint16x8_t recip = vdupq_n_s16(0);
1618 recip = vbslq_s16(calc_reciprocal, a, recip);
1619
1620 // Calculate reciprocal
1621 recip = vqrecipq_qs16(recip, fixed_point_position);
1622 a = vbslq_s16(calc_reciprocal, recip, a);
1623
1624 // Get decimal part of a
1625 qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
1626 qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position
1627
1628 // Get exponent of 2^n which is equal or less than dec_a
1629 shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
1630
1631 // Get x to range (1, 2]
1632 const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
1633 const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
1634 const qint16x8_t sum = vmulq_s16(shift_value, const_one);
1635
1636 // Polynomial Approximation
1637 qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
1638
1639 // Reconstruct
1640 poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
1641
1642 // Set negative value for 0 < a < 1
1643 poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
1644
1645 return poly;
1646}
1647
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001648inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1649{
1650 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1651
1652 // Find shift value. Number must be in (0.5, 2) range.
1653 qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
1654
1655 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1656 qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
1657 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
1658 temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
1659 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1660
1661 temp = vshl_s8(a, shift_value);
1662
1663 // Initial guess
1664 qint8x8_t x = temp;
1665
1666 // Calculate (x / 2) * (3 - a * x^2)
1667 // After three iterations we have the result for 8 bit
1668 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1669 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1670 x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1671
1672 return vshl_s8(x, shift_value2);
1673}
1674
/** Inverse square root of a fixed point 16 bit vector (4 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(15-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalize the input into the convergence range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vshl_s16(x, shift_value2);
}
1703
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001704inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1705{
1706 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1707
1708 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001709 qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001710
1711 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001712 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001713 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001714 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001715 qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001716
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001717 temp = vqshl_s8(a, shift_value);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001718
1719 // Initial guess
1720 qint8x8_t x = temp;
1721
1722 // Calculate (x / 2) * (3 - a * x^2)
1723 // After three iterations we have the result for 8 bit
1724 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1725 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1726 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1727
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001728 return vqshl_s8(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001729}
1730
/** Saturating inverse square root of a fixed point 16 bit vector (4 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(15-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format (saturating arithmetic).
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalize the input into the convergence range
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vqshl_s16(x, shift_value2);
}
1759
/** Inverse square root of a fixed point 8 bit vector (16 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(7-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize the input into the convergence range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vshlq_s8(x, shift_value2);
}
1786
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001787inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1788{
1789 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1790
1791 // Find shift value. Number must be in (0.5, 2) range.
1792 qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1793
1794 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1795 qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1796 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1797 temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
1798 qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
1799
1800 temp = vshlq_s16(a, shift_value);
1801
1802 // Initial guess
1803 qint16x8_t x = temp;
1804
1805 // Calculate (x / 2) * (3 - a * x^2)
1806 // After five iterations we have the result for 16 bit
1807 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1808 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1809 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1810 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1811 x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1812
1813 return vshlq_s16(x, shift_value2);
1814}
1815
/** Saturating inverse square root of a fixed point 8 bit vector (16 elements) via Newton-Raphson.
 *
 * @param a                    Input vector in Qn.(7-n) fixed point format; assumed positive — TODO confirm caller contract.
 * @param fixed_point_position Number of fractional bits n.
 *
 * @return Per-lane 1/sqrt(a) in the same fixed point format (saturating arithmetic).
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Halved de-normalization shift: 1/sqrt scales the result by 2^(-n/2)
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalize the input into the convergence range
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo the normalization
    return vqshlq_s8(x, shift_value2);
}
1842
1843inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
1844{
1845 const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
1846
1847 // Find shift value. Number must be in (0.5, 2) range.
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001848 qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001849
1850 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
1851 qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
1852 uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
1853 temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001854 qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001855
1856 temp = vqshlq_s16(a, shift_value);
1857
1858 // Initial guess
1859 qint16x8_t x = temp;
1860
1861 // Calculate (x / 2) * (3 - a * x^2)
1862 // After five iterations we have the result for 16 bit
1863 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1864 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1865 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1866 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1867 x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1868
1869 return vqshlq_s16(x, shift_value2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001870}
1871
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001872inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001873{
1874 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1875 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1876
1877 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1878 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1879 qint8x8_t den = vqadd_qs8(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001880 qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001881
1882 return tanh;
1883}
1884
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001885inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001886{
1887 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1888 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1889
1890 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1891 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1892 qint16x4_t den = vqadd_qs16(exp2x, const_one);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001893 qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001894
1895 return tanh;
1896}
1897
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001898inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001899{
1900 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1901 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1902
1903 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1904 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1905 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1906 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1907
1908 return tanh;
1909}
1910
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001911inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1912{
1913 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1914 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1915
1916 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1917 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1918 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1919 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1920
1921 return tanh;
1922}
1923
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001924inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1925{
1926 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1927}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001928
1929inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1930{
1931 float32x4x2_t res =
1932 {
1933 {
1934 vmaxq_f32(a.val[0], b.val[0]),
1935 vmaxq_f32(a.val[1], b.val[1])
1936 }
1937 };
1938 return res;
1939}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001940}