blob: 966313d58b63eddbe2a5b42b74aca0147c72ccef [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Abe Mbise4bd2cb82017-09-27 18:39:19 +010024#include <array>
Georgios Pinitas00394ae2017-06-22 18:13:55 +010025#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
27namespace arm_compute
28{
// Polynomial approximation tables for exp()/log() in fixed point.
// Each table holds the 4 polynomial coefficients, one coefficient broadcast
// across every lane of the vector, so a whole vector can be evaluated with
// Horner's scheme using the multiply-accumulate wrappers below.

/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements
 */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements
 */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements
 */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements
 */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6
 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14
 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
132
// -------------------------------------------------------------------------
// Thin wrappers mapping the library's fixed point vector types (qint*) onto
// the raw NEON integer intrinsics. A fixed point value shares the bit
// pattern of a plain integer, so half-extraction, load, store, narrow and
// broadcast operations need no Q-format handling and simply forward.
// -------------------------------------------------------------------------

// Extract the lower/upper half (d-register) of a q-register vector.
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

// Loads from memory (vld2q de-interleaves two vectors).
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

// Load a single scalar and broadcast it to every lane.
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}

// Stores to memory (vst2q interleaves two vectors).
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}

// Saturating narrow to the next smaller element width.
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

// Broadcast a scalar to every lane.
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
247
248inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
249{
250 float32x4x4_t res =
251 {
252 {
253 vdupq_n_f32(a),
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 vdupq_n_f32(a),
257 }
258 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100259 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100260}
261
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100262inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
263{
264 float32x4x2_t res =
265 {
266 {
267 vdupq_n_f32(a),
268 vdupq_n_f32(a),
269 }
270 };
271 return vqcvtq_qs16_f32(res, fixed_point_position);
272}
273
// -------------------------------------------------------------------------
// Element-wise arithmetic wrappers. Addition, subtraction, absolute value,
// minimum and maximum are Q-format agnostic (the binary point does not move),
// so they forward directly to the integer intrinsics. The vq* variants
// saturate instead of wrapping on overflow; vp* variants operate pairwise.
// -------------------------------------------------------------------------

inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}

// Absolute value. Note: non-saturating vabs wraps for the most negative
// value (e.g. |INT8_MIN| stays INT8_MIN); use the vqabs variants to saturate.
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}

// Element-wise and pairwise maximum.
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

// Element-wise and pairwise minimum.
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}

// Addition (wrapping) and saturating addition.
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

// Pairwise add-long: sums adjacent pairs, widening to 16 bit lanes.
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}

// Subtraction (wrapping) and saturating subtraction.
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
478
// -------------------------------------------------------------------------
// Fixed point multiplication.
//
// Common scheme: widen the operands via a long multiply-accumulate into a
// register pre-loaded with the rounding bias 1 << (fixed_point_position - 1),
// shift right by fixed_point_position to restore the Q format (implemented
// as a left shift by a negative amount), then narrow back to the original
// width. The vq* variants use saturating shifts and saturating narrows; the
// plain variants wrap on overflow.
// -------------------------------------------------------------------------

// Multiply two QS8 vectors (8 elements), rounding, wrapping on overflow.
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

// Multiply two QS16 vectors (4 elements), rounding, wrapping on overflow.
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

// Multiply two QS8 vectors (16 elements): processed as two 8-element halves
// because the widened products need two 16x8 accumulators.
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

// Multiply two QS16 vectors (8 elements): processed as two 4-element halves.
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}

// Saturating multiply of two QS8 vectors (8 elements).
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

// Saturating multiply of two QS16 vectors (4 elements).
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

// Saturating multiply of two QS8 vectors (16 elements), two halves at a time.
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

// Saturating multiply of two QS16 vectors (8 elements), two halves at a time.
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
626
// Widening multiply of two QS8 vectors: the 16 bit product keeps full
// precision, so only the Q-format correction shift is applied, using a
// rounding saturating shift (vqrshl).
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
635
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100636inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
637{
638 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
639
640 // Initialize the temporary results with a constant used to round up the result
641 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
642
643 // Vector multiply-accumulate long
644 tmp = vmull_s16(a, b);
645
646 // Shift right by fixed_point_position
647 return vqshlq_s32(tmp, fixed_point_position_s32);
648}
649
// -------------------------------------------------------------------------
// Fixed point multiply-accumulate: a + b * c. The product b * c is computed
// with the widen / round / shift / narrow scheme described above, then added
// to the accumulator a. These variants wrap on overflow; see vqmla* below
// for the saturating versions.
// -------------------------------------------------------------------------

// QS8 multiply-accumulate, 8 elements: a + b * c.
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

// QS16 multiply-accumulate, 4 elements: a + b * c.
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

// QS8 multiply-accumulate, 16 elements, two halves at a time.
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

// QS16 multiply-accumulate, 8 elements, two halves at a time.
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
723
// Saturating QS8 multiply-accumulate, 8 elements: a + b * c with saturation
// on both the Q-format shift and the final narrow/add.
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

// Saturating QS16 multiply-accumulate, 4 elements: a + b * c.
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
757
// Saturating QS8 multiply-accumulate, 16 elements, two halves at a time.
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

// Saturating QS16 multiply-accumulate, 8 elements, two halves at a time.
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
799
// -------------------------------------------------------------------------
// Widening multiply-accumulate: the accumulator a is already in the wider
// format (QS16 accumulating QS8 products, QS32 accumulating QS16 products),
// so the rounded, shifted product is added without narrowing.
// -------------------------------------------------------------------------

// QS16 accumulator += QS8 product b * c (wrapping add).
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

// QS32 accumulator += QS16 product b * c (wrapping add).
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

// QS16 accumulator += QS8 product b * c, saturating shift and add.
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

// QS32 accumulator += QS16 product b * c, saturating shift and add.
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
867
// Convert 8 floats (two float32x4 vectors) to a QS8 vector with saturation.
// Scales by 2^fixed_point_position and rounds to nearest by first seeding
// each lane with +/-0.5 according to the sign of the input, then truncating
// via vcvtq_s32_f32.
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Saturating narrow 32 -> 16 -> 8 bit
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
895
// Convert 4 floats to a QS16 vector with saturation. Same scheme as
// vqcvt_qs8_f32: scale by 2^fixed_point_position, round to nearest via a
// sign-dependent +/-0.5 bias, truncate, then saturating narrow to 16 bit.
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = bias + a * 2^fixed_point_position
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Saturating narrow to 16 bit
    return vqmovn_s32(res_s32);
}
908
// Convert 16 floats (four float32x4 vectors) to a QS8 vector with
// saturation. Same round-to-nearest scheme as vqcvt_qs8_f32, applied to all
// four input vectors, then narrowed 32 -> 16 -> 8 bit with saturation.
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position to move the binary point
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding bias: +0.5 for non-negative inputs, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to integer (the pre-applied bias makes this round-to-nearest)
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Saturating narrow 32 -> 16 bit
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Saturating narrow 16 -> 8 bit
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
948
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100949inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100950{
951 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
952
953 float32x4x2_t res_f32 =
954 {
955 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100956 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
957 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100958 }
959 };
960
961 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
962 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
963
964 const int32x4x2_t res_s32 =
965 {
966 {
967 vcvtq_s32_f32(res_f32.val[0]),
968 vcvtq_s32_f32(res_f32.val[1])
969 }
970 };
971
972 return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
973}
974
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100975inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
976{
977 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
978
979 const int16x8_t res_s16 = vmovl_s8(a);
980
981 const int32x4x2_t res_s32 =
982 {
983 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100984 vmovl_s16(vget_low_qs16(res_s16)),
985 vmovl_s16(vget_high_qs16(res_s16))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100986 }
987 };
988
989 float32x4x2_t res_f32 =
990 {
991 {
992 vcvtq_f32_s32(res_s32.val[0]),
993 vcvtq_f32_s32(res_s32.val[1])
994 }
995 };
996
997 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
998 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
999
1000 return res_f32;
1001}
1002
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001003inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
1004{
1005 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1006 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
1007
1008 return vmulq_f32(res_f32, pow2);
1009}
1010
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001011inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
1012{
1013 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1014
1015 const int16x8x2_t res_s16 =
1016 {
1017 {
1018 vmovl_s8(vget_low_s8(a)),
1019 vmovl_s8(vget_high_s8(a)),
1020 }
1021 };
1022
1023 const int32x4x4_t res_s32 =
1024 {
1025 {
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001026 vmovl_s16(vget_low_qs16(res_s16.val[0])),
1027 vmovl_s16(vget_high_qs16(res_s16.val[0])),
1028 vmovl_s16(vget_low_qs16(res_s16.val[1])),
1029 vmovl_s16(vget_high_qs16(res_s16.val[1])),
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001030 }
1031 };
1032
1033 float32x4x4_t res_f32 =
1034 {
1035 {
1036 vcvtq_f32_s32(res_s32.val[0]),
1037 vcvtq_f32_s32(res_s32.val[1]),
1038 vcvtq_f32_s32(res_s32.val[2]),
1039 vcvtq_f32_s32(res_s32.val[3])
1040 }
1041 };
1042
1043 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1044 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1045 res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
1046 res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
1047
1048 return res_f32;
1049}
1050
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001051inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
1052{
1053 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1054
1055 const int32x4x2_t res_s32 =
1056 {
1057 {
1058 vmovl_s16(vget_low_qs16(a)),
1059 vmovl_s16(vget_high_qs16(a))
1060 }
1061 };
1062
1063 float32x4x2_t res_f32 =
1064 {
1065 {
1066 vcvtq_f32_s32(res_s32.val[0]),
1067 vcvtq_f32_s32(res_s32.val[1])
1068 }
1069 };
1070
1071 res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
1072 res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
1073
1074 return res_f32;
1075}
1076
/** Calculate the reciprocal of an 8 bit fixed point vector (8 elements)
 *
 * Normalizes the input, computes a Newton-Raphson initial estimate
 * x0 = 48/17 - 32/17 * a, refines it with three iterations of
 * x = x * (2 - a * x), then denormalizes the result.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0 in the Q format
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vshl_s8(x, shift_value);
}
1102
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001103inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1104{
1105 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1106 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1107 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1108 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001109 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001110
1111 // Find shift value
1112 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1113 const qint16x4_t temp = vshl_s16(a, shift_value);
1114
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001115 // Newton-Raphson division initial estimate X0 calculation
1116 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001117
1118 uint16x4_t set_one = vcgt_s16(x, const_one);
1119 x = vbsl_s16(set_one, const_one, x);
1120
Michalis Spyrou25466a92017-08-17 12:56:46 +01001121 // Use four iterations of Newton-Raphson method to get the result
1122 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1123 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1124 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1125 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001126
1127 return vshl_s16(x, shift_value);
1128}
1129
/** Calculate the reciprocal of an 8 bit fixed point vector (8 elements), saturating
 *
 * Same Newton-Raphson scheme as vrecip_qs8 but built from saturating
 * intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to at most 1.0 in the Q format
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vqshl_s8(x, shift_value);
}
1155
1156inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1157{
1158 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1159 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1160 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1161 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001162 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001163
1164 // Find shift value
1165 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1166 const qint16x4_t temp = vqshl_s16(a, shift_value);
1167
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001168 // Newton-Raphson division initial estimate X0 calculation
1169 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001170
1171 uint16x4_t set_one = vcgt_s16(x, const_one);
1172 x = vbsl_s16(set_one, const_one, x);
1173
Michalis Spyrou25466a92017-08-17 12:56:46 +01001174 // Use four iterations of Newton-Raphson method to get the result
1175 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1176 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1177 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1178 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001179
1180 return vqshl_s16(x, shift_value);
1181}
1182
/** Calculate the reciprocal of an 8 bit fixed point vector (16 elements)
 *
 * Normalizes the input, computes a Newton-Raphson initial estimate
 * x0 = 48/17 - 32/17 * a, refines it with three iterations of
 * x = x * (2 - a * x), then denormalizes the result.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vshlq_s8(x, shift_value);
}
1209
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001210inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1211{
1212 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1213 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1214 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1215 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001216 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001217
1218 // Find shift value
1219 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1220 const qint16x8_t temp = vshlq_s16(a, shift_value);
1221
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001222 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001223 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1224
1225 // Set initial guess to one if x > 1
1226 uint16x8_t set_one = vcgtq_s16(x, const_one);
1227 x = vbslq_s16(set_one, const_one, x);
1228
Michalis Spyrou25466a92017-08-17 12:56:46 +01001229 // Use four iterations of Newton-Raphson method to get the result
1230 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1231 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1232 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1233 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001234
1235 return vshlq_s16(x, shift_value);
1236}
1237
/** Calculate the reciprocal of an 8 bit fixed point vector (16 elements), saturating
 *
 * Same Newton-Raphson scheme as vrecipq_qs8 but built from saturating
 * intrinsics so intermediate overflow clamps instead of wrapping.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Reciprocal in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element width of the s8 lanes)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: 48/17 - 32/17 * a
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x = x * (2 - a * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift: 1/a = (1/temp) * 2^shift
    return vqshlq_s8(x, shift_value);
}
1264
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001265inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1266{
1267 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1268 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1269 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1270 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001271 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001272
1273 // Find shift value
1274 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1275 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1276
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001277 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001278 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1279
1280 // Set initial guess to one if x > 1
1281 uint16x8_t set_one = vcgtq_s16(x, const_one);
1282 x = vbslq_s16(set_one, const_one, x);
1283
Michalis Spyrou25466a92017-08-17 12:56:46 +01001284 // Use four iterations of Newton-Raphson method to get the result
1285 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1286 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1287 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1288 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001289
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001290 // Saturate result in case of overflow
1291 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001292}
1293
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001294inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1295{
1296 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1297}
1298
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001299inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1300{
1301 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1302}
1303
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001304inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1305{
1306 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1307}
1308
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001309inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1310{
1311 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1312}
1313
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001314template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001315inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001316{
1317 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1318 const qint8x8_t const_one = vdup_n_s8(1);
1319 const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
1320 const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1321 const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1322 const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1323 const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
1324 const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
1325 const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
1326 const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
1327 return res;
1328}
1329
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001330template <bool islog>
1331inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1332{
1333 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1334 const qint16x4_t const_one = vdup_n_s16(1);
1335 const qint16x4_t A = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
1336 const qint16x4_t B = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1337 const qint16x4_t C = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1338 const qint16x4_t D = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1339 const qint16x4_t x1 = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
1340 const qint16x4_t x2 = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
1341 const qint16x4_t x3 = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
1342 const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
1343 return res;
1344}
1345
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001346template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001347inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001348{
1349 const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
1350 const qint8x8_t const_one = vdup_n_s8(1);
1351 const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
1352 const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
1353 const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
1354 const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
1355 const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
1356 const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
1357 const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
1358 const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
1359 return res;
1360}
1361
1362template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001363inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
1364{
1365 const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
1366 const qint16x4_t const_one = vdup_n_s16(1);
1367 const qint16x4_t A = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
1368 const qint16x4_t B = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
1369 const qint16x4_t C = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
1370 const qint16x4_t D = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
1371 const qint16x4_t x1 = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
1372 const qint16x4_t x2 = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
1373 const qint16x4_t x3 = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
1374 const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
1375 return res;
1376}
1377
1378template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001379inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1380{
1381 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1382 const qint8x16_t const_one = vdupq_n_s8(1);
1383 const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
1384 const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1385 const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1386 const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1387 const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
1388 const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
1389 const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
1390 const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
1391 return res;
1392}
1393
1394template <bool islog>
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001395inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1396{
1397 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1398 const qint16x8_t const_one = vdupq_n_s16(1);
1399 const qint16x8_t A = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
1400 const qint16x8_t B = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1401 const qint16x8_t C = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1402 const qint16x8_t D = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1403 const qint16x8_t x1 = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
1404 const qint16x8_t x2 = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
1405 const qint16x8_t x3 = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
1406 const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
1407 return res;
1408}
1409
1410template <bool islog>
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001411inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
1412{
1413 const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
1414 const qint8x16_t const_one = vdupq_n_s8(1);
1415 const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
1416 const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
1417 const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
1418 const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
1419 const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
1420 const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
1421 const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
1422 const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
1423 return res;
1424}
1425
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001426template <bool islog>
1427inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
1428{
1429 const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
1430 const qint16x8_t const_one = vdupq_n_s16(1);
1431 const qint16x8_t A = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
1432 const qint16x8_t B = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
1433 const qint16x8_t C = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
1434 const qint16x8_t D = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
1435 const qint16x8_t x1 = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
1436 const qint16x8_t x2 = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
1437 const qint16x8_t x3 = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
1438 const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
1439 return res;
1440}
1441
/** Calculate the saturating exponential of an 8 bit fixed point vector (8 elements)
 *
 * Splits x as x = m*ln(2) + alpha (range reduction), approximates e^alpha with
 * a polynomial, then reconstructs e^x = e^alpha * 2^m via a shift.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Exponential in the same Q format
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the number of fractional bits)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |x - dec_m * ln(2)|, the residual within [-log(2),log(2)]
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1.0
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: e^x = e^alpha * 2^dec_m
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1467
/** Calculate the saturating exponential of a 16 bit fixed point vector (4 elements)
 *
 * Splits x as x = m*ln(2) + alpha (range reduction), approximates e^alpha with
 * a polynomial, then reconstructs e^x = e^alpha * 2^m via a shift.
 *
 * @param[in] a                    Input vector
 * @param[in] fixed_point_position Number of fractional bits of the Q format
 *
 * @return Exponential in the same Q format
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the number of fractional bits)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |x - dec_m * ln(2)|, the residual within [-log(2),log(2)]
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of e^alpha - 1, then add 1.0
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: e^x = e^alpha * 2^dec_m
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1493
/** Calculate the exponential of a fixed point 8 bit vector (16 elements), saturating.
 *
 * Same algorithm as vqexp_qs8, operating on a full 128-bit register:
 * exp(a) = 2^m * exp(alpha) with m = floor(a / ln(2)) and alpha the residual.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated exponential
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    // Constants are stored in Q0.7 and rescaled to the requested Qx.y format
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the fractional bits)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual left for the polynomial
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); the polynomial covers exp(alpha) - 1,
    // so 1 is added back afterwards
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct result as poly * 2^dec_m (saturating shift)
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1519
/** Calculate the exponential of a fixed point 16 bit vector (8 elements), saturating.
 *
 * Same algorithm as vqexp_qs16, operating on a full 128-bit register:
 * exp(a) = 2^m * exp(alpha) with m = floor(a / ln(2)) and alpha the residual.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated exponential
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    // Constants are stored in Q0.15 and rescaled to the requested Qx.y format
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by the fractional bits)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual left for the polynomial
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha); the polynomial covers exp(alpha) - 1,
    // so 1 is added back afterwards
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct result as poly * 2^dec_m (saturating shift)
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1545
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001546inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1547{
1548 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1549 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1550 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1551
1552 // If 0 < a < 1, calculate log(1/x)
1553 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1554 qint8x8_t recip = vdup_n_s8(0);
1555 recip = vbsl_s8(calc_reciprocal, recip, a);
1556
1557 // Calculate reciprocal
1558 recip = vrecip_qs8(recip, fixed_point_position);
1559 a = vbsl_s8(calc_reciprocal, recip, a);
1560
1561 // Get decimal part of a
1562 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1563 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1564
1565 // Get exponent of 2^n which is equal or less than dec_a
1566 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1567
1568 // Get x to range (1, 2]
1569 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1570 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1571 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1572
1573 // Polynomial Approximation
1574 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1575
1576 // Reconstruct
1577 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1578
1579 // Set negative value for 0 < a < 1
1580 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1581
1582 return poly;
1583}
1584
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001585inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1586{
1587 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1588 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1589 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1590
1591 // If 0 < a < 1, calculate log(1/x)
1592 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1593 qint16x4_t recip = vdup_n_s16(0);
1594 recip = vbsl_s16(calc_reciprocal, recip, a);
1595
1596 // Calculate reciprocal
1597 recip = vrecip_qs16(recip, fixed_point_position);
1598 a = vbsl_s16(calc_reciprocal, recip, a);
1599
1600 // Get decimal part of a
1601 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1602 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1603
1604 // Get exponent of 2^n which is equal or less than dec_a
1605 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1606
1607 // Get x to range (1, 2]
1608 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1609 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1610 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1611
1612 // Polynomial Approximation
1613 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1614
1615 // Reconstruct
1616 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1617
1618 // Set negative value for 0 < a < 1
1619 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1620
1621 return poly;
1622}
1623
/** Calculate the natural logarithm of a fixed point 8 bit vector (16 elements).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used: the reciprocal is taken
 * first and the sign of the result is flipped at the end. The input is then
 * normalised into (1, 2] by removing a power of two 2^n, a polynomial approximates
 * the logarithm of the normalised value, and the result is reconstructed as
 * (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated logarithm
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    // ln(2) stored in Q0.7 (0x58) and rescaled to the requested Qx.y format
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep only the lanes that need the reciprocal (a < 1); the others stay 0
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal and substitute it on the lanes where a < 1
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent n of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n in Qx.y format
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1662
/** Calculate the natural logarithm of a fixed point 16 bit vector (8 elements).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used: the reciprocal is taken
 * first and the sign of the result is flipped at the end. The input is then
 * normalised into (1, 2] by removing a power of two 2^n, a polynomial approximates
 * the logarithm of the normalised value, and the result is reconstructed as
 * (poly + n) * ln(2).
 *
 * NOTE(review): this 16-bit variant mixes saturating ops (vqsubq/vqaddq/vqmulq,
 * vqrecipq_qs16) with the non-saturating structure of vlogq_qs8 — presumably to
 * guard against overflow in intermediate values; confirm against the reference
 * implementation.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated logarithm
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    // ln(2) stored in Q0.15 (0x58B9) and rescaled to the requested Qx.y format
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep only the lanes that need the reciprocal (a < 1); the others stay 0
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal and substitute it on the lanes where a < 1
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent n of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]: temp = a / 2^n - 1, sum = n in Qx.y format
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1701
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * The input is first scaled into the (0.5, 2) range, Newton-Raphson iterations
 * x <- (x / 2) * (3 - a * x^2) refine the estimate, and the result is rescaled
 * by half the normalisation shift (since 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshl_s8(x, shift_value2);
}
1728
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * The input is first scaled into the (0.5, 2) range, Newton-Raphson iterations
 * x <- (x / 2) * (3 - a * x^2) refine the estimate, and the result is rescaled
 * by half the normalisation shift (since 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshl_s16(x, shift_value2);
}
1757
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements), saturating.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8, but all arithmetic that could
 * overflow uses the saturating intrinsics (vqsub/vqadd/vqneg/vqshl/vqmul).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshl_s8(x, shift_value2);
}
1784
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements), saturating.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16, but all arithmetic that could
 * overflow uses the saturating intrinsics (vqsub/vqadd/vqneg/vqshl/vqmul).
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshl_s16(x, shift_value2);
}
1813
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshlq_s8(x, shift_value2);
}
1840
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vshlq_s16(x, shift_value2);
}
1869
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements), saturating.
 *
 * Same Newton-Raphson scheme as vqinvsqrt_qs8, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshlq_s8(x, shift_value2);
}
1896
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements), saturating.
 *
 * Same Newton-Raphson scheme as vqinvsqrt_qs16, operating on a full 128-bit register.
 *
 * @param[in] a                    Vector of fixed point values in Qx.y format
 *                                 (expected positive; not validated)
 * @param[in] fixed_point_position Number of fractional bits y
 *
 * @return The calculated inverse square root
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half of the normalisation shift, used to rescale the converged result
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    // Normalise the input into (0.5, 2)
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the normalisation shift
    return vqshlq_s16(x, shift_value2);
}
1925
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001926inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001927{
1928 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1929 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1930
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001931 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1932 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1933 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1934 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001935
1936 return tanh;
1937}
1938
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001939inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001940{
1941 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1942 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1943
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001944 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1945 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1946 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1947 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001948
1949 return tanh;
1950}
1951
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001952inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001953{
1954 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1955 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1956
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001957 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1958 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1959 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1960 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001961
1962 return tanh;
1963}
1964
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001965inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1966{
1967 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1968 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1969
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001970 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1971 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1972 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1973 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001974
1975 return tanh;
1976}
1977
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001978inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1979{
1980 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1981}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001982
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001983inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1984{
1985 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1986}
1987
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001988inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1989{
1990 float32x4x2_t res =
1991 {
1992 {
1993 vmaxq_f32(a.val[0], b.val[0]),
1994 vmaxq_f32(a.val[1], b.val[1])
1995 }
1996 };
1997 return res;
1998}
Gian Marco Iodice356f6432017-09-22 11:32:21 +01001999} // namespace arm_compute