blob: b86c3cbec3a7b630c98f3e52f0017f2815ade385 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Alex Gildayc357c472018-03-21 13:54:09 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Abe Mbise4bd2cb82017-09-27 18:39:19 +010024#include <array>
Georgios Pinitas00394ae2017-06-22 18:13:55 +010025#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
27namespace arm_compute
28{
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010029/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
30 * Format is in Q0.7 for all elements
31 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010032static const std::array<qint8x8_t, 4> exp_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033{
34 {
35 vdup_n_s8(0x7F), // 0.9978546
36 vdup_n_s8(0x3F), // 0.4994721
37 vdup_n_s8(0x16), // 0.1763723
38 vdup_n_s8(0x05), // 0.0435108
39 }
40};
41
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010042/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
43 * Format is in Q0.15 for all elements
44 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010045static const std::array<qint16x4_t, 4> exp_tab_qs16 =
46{
47 {
48 vdup_n_s16(0x7FBA), // 0.9978546
49 vdup_n_s16(0x3FE9), // 0.4994721
50 vdup_n_s16(0x1693), // 0.1763723
51 vdup_n_s16(0x0592), // 0.0435108
52 }
53};
54
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010055/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
56 * Format is in Q0.7 for all elements
57 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010058static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010059{
60 {
61 vdupq_n_s8(0x7F), // 0.9978546
62 vdupq_n_s8(0x3F), // 0.4994721
63 vdupq_n_s8(0x16), // 0.1763723
64 vdupq_n_s8(0x05), // 0.0435108
65 }
66};
67
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010068/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
69 * Format is in Q0.15 for all elements
70 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010071static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
72{
73 {
74 vdupq_n_s16(0x7FBA), // 0.9978546
75 vdupq_n_s16(0x3FE9), // 0.4994721
76 vdupq_n_s16(0x1693), // 0.1763723
77 vdupq_n_s16(0x0592), // 0.0435108
78 }
79};
80
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010081/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
82 * Format is in Q0.7 for all elements except the first one which is in Q1.6
83 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010084static const std::array<qint8x8_t, 4> log_tab_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +010085{
86 {
87 vdup_n_s8(0x5C), // 1.4384189
88 vdup_n_s8(-0x56), // -0.6771900
89 vdup_n_s8(0x29), // 0.3218538
90 vdup_n_s8(-0x0A), // -0.0832229
91 }
92};
93
Michalis Spyroubbd9fb92017-06-22 12:57:51 +010094/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
95 * Format is in Q0.15 for all elements except the first one which is in Q1.14
96 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +010097static const std::array<qint16x4_t, 4> log_tab_qs16 =
98{
99 {
100 vdup_n_s16(0x5C0F), // 1.4384189
101 vdup_n_s16(-0x56AE), // -0.6771900
102 vdup_n_s16(0x2933), // 0.3218538
103 vdup_n_s16(-0x0AA7), // -0.0832229
104 }
105};
106
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100107/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
108 * Format is in Q0.7 for all elements except the first one which is in Q1.6
109 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100110static const std::array<qint8x16_t, 4> log_tabq_qs8 =
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100111{
112 {
113 vdupq_n_s8(0x5C), // 1.4384189
114 vdupq_n_s8(-0x56), // -0.6771900
115 vdupq_n_s8(0x29), // 0.3218538
116 vdupq_n_s8(-0x0A), // -0.0832229
117 }
118};
119
Michalis Spyroubbd9fb92017-06-22 12:57:51 +0100120/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
121 * Format is in Q0.15 for all elements except the first one which is in Q1.14
122 */
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100123static const std::array<qint16x8_t, 4> log_tabq_qs16 =
124{
125 {
126 vdupq_n_s16(0x5C0F), // 1.4384189
127 vdupq_n_s16(-0x56AE), // -0.6771900
128 vdupq_n_s16(0x2933), // 0.3218538
129 vdupq_n_s16(-0x0AA7), // -0.0832229
130 }
131};
132
Alex Gildayc357c472018-03-21 13:54:09 +0000133#ifndef DOXYGEN_SKIP_THIS
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134inline qint8x8_t vget_low_qs8(qint8x16_t a)
135{
136 return vget_low_s8(a);
137}
138
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100139inline qint16x4_t vget_low_qs16(qint16x8_t a)
140{
141 return vget_low_s16(a);
142}
143
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144inline qint8x8_t vget_high_qs8(qint8x16_t a)
145{
146 return vget_high_s8(a);
147}
148
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100149inline qint16x4_t vget_high_qs16(qint16x8_t a)
150{
151 return vget_high_s16(a);
152}
153
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100154inline qint8x8_t vld1_qs8(const qint8_t *addr)
155{
156 return vld1_s8(addr);
157}
158
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100159inline qint16x4_t vld1_qs16(const qint16_t *addr)
160{
161 return vld1_s16(addr);
162}
163
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100164inline qint8x16_t vld1q_qs8(const qint8_t *addr)
165{
166 return vld1q_s8(addr);
167}
168
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100169inline qint16x8_t vld1q_qs16(const qint16_t *addr)
170{
171 return vld1q_s16(addr);
172}
173
174inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
175{
176 return vld1_dup_s8(addr);
177}
178
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100179inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
180{
181 return vld1_dup_s16(addr);
182}
183
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100184inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
185{
186 return vld1q_dup_s8(addr);
187}
188
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100189inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
190{
191 return vld1q_dup_s16(addr);
192}
193
Michele Di Giorgio81f0d152017-07-11 15:00:52 +0100194inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
195{
196 return vld2q_s16(addr);
197}
198
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100199inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
200{
201 vst1_s8(addr, b);
202}
203
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100204inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
205{
206 vst1_s16(addr, b);
207}
208
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100209inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
210{
211 vst1q_s8(addr, b);
212}
213
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
215{
216 vst1q_s16(addr, b);
217}
218
Georgios Pinitasccc65d42017-06-27 17:39:11 +0100219inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
220{
221 vst2q_s16(addr, b);
222}
223
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224inline qint8x8_t vqmovn_qs16(qint16x8_t a)
225{
226 return vqmovn_s16(a);
227}
228
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100229inline qint16x4_t vqmovn_qs32(qint32x4_t a)
230{
231 return vqmovn_s32(a);
232}
233
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234inline qint8x8_t vdup_n_qs8(qint8_t a)
235{
236 return vdup_n_s8(a);
237}
238
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100239inline qint16x4_t vdup_n_qs16(qint16_t a)
240{
241 return vdup_n_s16(a);
242}
243
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100244inline qint8x16_t vdupq_n_qs8(qint8_t a)
245{
246 return vdupq_n_s8(a);
247}
248
249inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
250{
251 float32x4x4_t res =
252 {
253 {
254 vdupq_n_f32(a),
255 vdupq_n_f32(a),
256 vdupq_n_f32(a),
257 vdupq_n_f32(a),
258 }
259 };
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100260 return vqcvtq_qs8_f32(res, fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100261}
262
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +0100263inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
264{
265 float32x4x2_t res =
266 {
267 {
268 vdupq_n_f32(a),
269 vdupq_n_f32(a),
270 }
271 };
272 return vqcvtq_qs16_f32(res, fixed_point_position);
273}
274
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100275inline qint16x8_t vdupq_n_qs16(qint16_t a)
276{
277 return vdupq_n_s16(a);
278}
279
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100280inline qint32x4_t vdupq_n_qs32(qint32_t a)
281{
282 return vdupq_n_s32(a);
283}
284
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285inline qint8x8_t vabs_qs8(qint8x8_t a)
286{
287 return vabs_s8(a);
288}
289
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100290inline qint16x4_t vabs_qs16(qint16x4_t a)
291{
292 return vabs_s16(a);
293}
294
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100295inline qint8x16_t vabsq_qs8(qint8x16_t a)
296{
297 return vabsq_s8(a);
298}
299
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100300inline qint16x8_t vabsq_qs16(qint16x8_t a)
301{
302 return vabsq_s16(a);
303}
304
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100305inline qint8x8_t vqabs_qs8(qint8x8_t a)
306{
307 return vqabs_s8(a);
308}
309
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100310inline qint16x4_t vqabs_qs16(qint16x4_t a)
311{
312 return vqabs_s16(a);
313}
314
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100315inline qint8x16_t vqabsq_qs8(qint8x16_t a)
316{
317 return vqabsq_s8(a);
318}
319
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100320inline qint16x8_t vqabsq_qs16(qint16x8_t a)
321{
322 return vqabsq_s16(a);
323}
324
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100325inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
326{
327 return vmax_s8(a, b);
328}
329
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100330inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
331{
332 return vmax_s16(a, b);
333}
334
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100335inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
336{
337 return vmaxq_s8(a, b);
338}
339
340inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
341{
342 return vpmax_s8(a, b);
343}
344
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100345inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
346{
347 return vpmax_s16(a, b);
348}
349
350inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
351{
352 return vmaxq_s16(a, b);
353}
354
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100355inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
356{
357 return vmin_s8(a, b);
358}
359
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100360inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
361{
362 return vmin_s16(a, b);
363}
364
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100365inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
366{
367 return vminq_s8(a, b);
368}
369
370inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
371{
372 return vpmin_s8(a, b);
373}
374
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100375inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
376{
377 return vpmin_s16(a, b);
378}
379
380inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
381{
382 return vminq_s16(a, b);
383}
384
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100385inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
386{
387 return vadd_s8(a, b);
388}
389
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100390inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
391{
392 return vadd_s16(a, b);
393}
394
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100395inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
396{
397 return vaddq_s8(a, b);
398}
399
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100400inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
401{
402 return vaddq_s16(a, b);
403}
404
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100405inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
406{
407 return vqadd_s8(a, b);
408}
409
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100410inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
411{
412 return vqadd_s16(a, b);
413}
414
Georgios Pinitas9247c922017-06-28 18:29:47 +0100415inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
416{
417 return vqadd_s32(a, b);
418}
419
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100420inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
421{
422 return vqaddq_s8(a, b);
423}
424
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100425inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
426{
427 return vqaddq_s16(a, b);
428}
429
Georgios Pinitas9247c922017-06-28 18:29:47 +0100430inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
431{
432 return vqaddq_s32(a, b);
433}
434
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100435inline int16x4_t vpaddl_qs8(qint8x8_t a)
436{
437 return vpaddl_s8(a);
438}
439
440inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
441{
442 return vsub_s8(a, b);
443}
444
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100445inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
446{
447 return vsub_s16(a, b);
448}
449
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100450inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
451{
452 return vsubq_s8(a, b);
453}
454
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100455inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
456{
457 return vsubq_s16(a, b);
458}
459
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100460inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
461{
462 return vqsub_s8(a, b);
463}
464
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100465inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
466{
467 return vqsub_s16(a, b);
468}
469
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100470inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
471{
472 return vqsubq_s8(a, b);
473}
474
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100475inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
476{
477 return vqsubq_s16(a, b);
478}
479
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100480inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
481{
482 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
483
484 // Initialize the temporary result with a constant used to round up the result
485 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
486
487 // Vector multiply-accumulate long
488 res = vmlal_s8(res, a, b);
489
490 // Shift right by fixed_point_position
491 res = vshlq_s16(res, fixed_point_position_s16);
492
493 // Convert back to qint8
494 return vmovn_s16(res);
495}
496
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100497inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
498{
499 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
500
501 // Initialize the temporary result with a constant used to round up the result
502 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
503
504 // Vector multiply-accumulate long
505 res = vmlal_s16(res, a, b);
506
507 // Shift right by fixed_point_position
508 res = vshlq_s32(res, fixed_point_position_s32);
509
510 // Convert back to qint16
511 return vmovn_s32(res);
512}
513
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
515{
516 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
517
518 // Initialize the temporary results with a constant used to round up the result
519 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
520 qint16x8_t res1 = res0;
521
522 // Vector multiply-accumulate long
523 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
524 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
525
526 // Shift right by fixed_point_position
527 res0 = vshlq_s16(res0, fixed_point_position_s16);
528 res1 = vshlq_s16(res1, fixed_point_position_s16);
529
530 // Convert back to qint8
531 return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
532}
533
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100534inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
535{
536 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
537
538 // Initialize the temporary results with a constant used to round up the result
539 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
540 qint32x4_t res1 = res0;
541
542 // Vector multiply-accumulate long
543 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
544 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
545
546 // Shift right by fixed_point_position
547 res0 = vshlq_s32(res0, fixed_point_position_s32);
548 res1 = vshlq_s32(res1, fixed_point_position_s32);
549
550 // Convert back to qint16
551 return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
552}
553
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100554inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
555{
556 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
557
558 // Initialize the temporary result with a constant used to round up the result
559 qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
560
561 // Vector multiply-accumulate long
562 res = vmlal_s8(res, a, b);
563
564 // Shift right by fixed_point_position
565 res = vqshlq_s16(res, fixed_point_position_s16);
566
567 // Convert back to qint8 and saturate
568 return vqmovn_s16(res);
569}
570
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100571inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
572{
573 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
574
575 // Initialize the temporary result with a constant used to round up the result
576 qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
577
578 // Vector multiply-accumulate long
579 res = vmlal_s16(res, a, b);
580
581 // Shift right by fixed_point_position
582 res = vqshlq_s32(res, fixed_point_position_s32);
583
584 // Convert back to qint16 and saturate
585 return vqmovn_s32(res);
586}
587
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100588inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
589{
590 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
591
592 // Initialize the temporary results with a constant used to round up the result
593 qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
594 qint16x8_t res1 = res0;
595
596 // Vector multiply-accumulate long
597 res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
598 res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
599
600 // Shift right by fixed_point_position
601 res0 = vqshlq_s16(res0, fixed_point_position_s16);
602 res1 = vqshlq_s16(res1, fixed_point_position_s16);
603
604 // Convert back to qint8 and saturate
605 return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
606}
607
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100608inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
609{
610 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
611
612 // Initialize the temporary results with a constant used to round up the result
613 qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
614 qint32x4_t res1 = res0;
615
616 // Vector multiply-accumulate long
617 res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
618 res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
619
620 // Shift right by fixed_point_position
621 res0 = vqshlq_s32(res0, fixed_point_position_s32);
622 res1 = vqshlq_s32(res1, fixed_point_position_s32);
623
624 // Convert back to qint16 and saturate
625 return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
626}
627
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100628inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
629{
630 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
631
632 qint16x8_t res = vmull_s8(a, b);
633
634 return vqrshlq_s16(res, fixed_point_position_s16);
635}
636
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100637inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
638{
639 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
640
641 // Initialize the temporary results with a constant used to round up the result
642 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
643
644 // Vector multiply-accumulate long
645 tmp = vmull_s16(a, b);
646
647 // Shift right by fixed_point_position
648 return vqshlq_s32(tmp, fixed_point_position_s32);
649}
650
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100651inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
652{
653 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
654
655 // Initialize the temporary results with a constant used to round up the result
656 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
657
658 // Vector multiply-accumulate long
659 tmp = vmlal_s8(tmp, b, c);
660
661 // Shift right by fixed_point_position
662 tmp = vshlq_s16(tmp, fixed_point_position_s16);
663
664 // Convert back to qint8 and accumulate
665 return vadd_s8(a, vmovn_s16(tmp));
666}
667
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100668inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
669{
670 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
671
672 // Initialize the temporary results with a constant used to round up the result
673 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
674
675 // Vector multiply-accumulate long
676 tmp = vmlal_s16(tmp, b, c);
677
678 // Shift right by fixed_point_position
679 tmp = vshlq_s32(tmp, fixed_point_position_s32);
680
681 // Convert back to qint16 and accumulate
682 return vadd_s16(a, vmovn_s32(tmp));
683}
684
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100685inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
686{
687 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
688
689 // Initialize the temporary results with a constant used to round up the result
690 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
691 qint16x8_t tmp1 = tmp0;
692
693 // Vector multiply-accumulate long
694 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
695 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
696
697 // Shift right by fixed_point_position
698 tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
699 tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
700
701 // Convert back to qint8 and accumulate
702 return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
703}
704
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100705inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
706{
707 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
708
709 // Initialize the temporary results with a constant used to round up the result
710 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
711 qint32x4_t tmp1 = tmp0;
712
713 // Vector multiply-accumulate long
714 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
715 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
716
717 // Shift right by fixed_point_position
718 tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
719 tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
720
721 // Convert back to qint16 and accumulate
722 return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
723}
724
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100725inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
726{
727 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
728
729 // Initialize the temporary results with a constant used to round up the result
730 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
731
732 // Vector multiply-accumulate long
733 tmp = vmlal_s8(tmp, b, c);
734
735 // Shift right by fixed_point_position
736 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
737
738 // Convert back to qint8 and accumulate
739 return vqadd_s8(a, vqmovn_s16(tmp));
740}
741
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100742inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
743{
744 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
745
746 // Initialize the temporary results with a constant used to round up the result
747 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
748
749 // Vector multiply-accumulate long
750 tmp = vmlal_s16(tmp, b, c);
751
752 // Shift right by fixed_point_position
753 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
754
755 // Convert back to qint8 and accumulate
756 return vqadd_s16(a, vqmovn_s32(tmp));
757}
758
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100759inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
760{
761 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
762
763 // Initialize the temporary results with a constant used to round up the result
764 qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
765 qint16x8_t tmp1 = tmp0;
766
767 // Vector multiply-accumulate long
768 tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
769 tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
770
771 // Shift right by fixed_point_position
772 tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
773 tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
774
775 // Convert back to qint8 and accumulate
776 qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
777 return vqaddq_s8(a, res);
778}
779
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100780inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
781{
782 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
783
784 // Initialize the temporary results with a constant used to round up the result
785 qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
786 qint32x4_t tmp1 = tmp0;
787
788 // Vector multiply-accumulate long
789 tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
790 tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
791
792 // Shift right by fixed_point_position
793 tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
794 tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
795
796 // Convert back to qint16 and accumulate
797 qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
798 return vqaddq_s16(a, res);
799}
800
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100801inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
802{
803 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
804
805 // Initialize the temporary results with a constant used to round up the result
806 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
807
808 // Vector multiply-accumulate long
809 tmp = vmlal_s8(tmp, b, c);
810
811 // Shift right by fixed_point_position
812 tmp = vshlq_s16(tmp, fixed_point_position_s16);
813
814 // Accumulate
815 return vaddq_s16(a, tmp);
816}
817
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100818inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
819{
820 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
821
822 // Initialize the temporary results with a constant used to round up the result
823 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
824
825 // Vector multiply-accumulate long
826 tmp = vmlal_s16(tmp, b, c);
827
828 // Shift right by fixed_point_position
829 tmp = vshlq_s32(tmp, fixed_point_position_s32);
830
831 // Accumulate
832 return vaddq_s32(a, tmp);
833}
834
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100835inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
836{
837 const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
838
839 // Initialize the temporary results with a constant used to round up the result
840 qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
841
842 // Vector multiply-accumulate long
843 tmp = vmlal_s8(tmp, b, c);
844
845 // Shift right by fixed_point_position
846 tmp = vqshlq_s16(tmp, fixed_point_position_s16);
847
848 // Accumulate
849 return vqaddq_s16(a, tmp);
850}
851
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100852inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
853{
854 const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
855
856 // Initialize the temporary results with a constant used to round up the result
857 qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
858
859 // Vector multiply-accumulate long
860 tmp = vmlal_s16(tmp, b, c);
861
862 // Shift right by fixed_point_position
863 tmp = vqshlq_s32(tmp, fixed_point_position_s32);
864
865 // Accumulate
866 return vqaddq_s32(a, tmp);
867}
868
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100869inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100870{
871 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
872
873 float32x4x2_t res_f32 =
874 {
875 {
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100876 vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
877 vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100878 }
879 };
880
881 res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
882 res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
883
884 const int32x4x2_t res_s32 =
885 {
886 {
887 vcvtq_s32_f32(res_f32.val[0]),
888 vcvtq_s32_f32(res_f32.val[1]),
889 }
890 };
891
892 const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
893
894 return vqmovn_s16(res_s16);
895}
896
/** Convert four 32-bit floats to four QS16 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Four QS16 values, saturated to the int16 range on overflow.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    // res = a * 2^fpp + rounding bias
    res_f32 = vmlaq_f32(res_f32, a, pow2);

    // Truncate to 32-bit integers
    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    // Narrow 32 -> 16 with saturation
    return vqmovn_s32(res_s32);
}
909
/** Convert sixteen 32-bit floats (four float32x4 lanes) to sixteen QS8 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats, a.val[0] holds the lowest 4 results, a.val[3] the highest.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Sixteen QS8 values, saturated to the int8 range on overflow.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fpp + rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncate to 32-bit integers
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 with saturation
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
949
/** Convert eight 32-bit floats (two float32x4 lanes) to eight QS16 fixed-point values with saturation.
 *
 * @param[in] a                    Input floats, a.val[0] holds the low 4 results, a.val[1] the high 4.
 * @param[in] fixed_point_position Number of fractional bits of the target Q format.
 *
 * @return Eight QS16 values, saturated to the int16 range on overflow.
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Scale factor 2^fixed_point_position applied before truncation to integer
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Start from +/-0.5 depending on sign so the truncating conversion rounds to nearest
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = a * 2^fpp + rounding bias
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    // Truncate to 32-bit integers
    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
975
/** Convert eight QS8 fixed-point values to eight 32-bit floats.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Two float32x4 lanes: val[0] holds the low 4 elements, val[1] the high 4.
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8_t res_s16 = vmovl_s8(a);

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16)),
            vmovl_s16(vget_high_qs16(res_s16))
        }
    };

    // Convert to float and rescale
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1003
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001004inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
1005{
1006 const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
1007 const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
1008
1009 return vmulq_f32(res_f32, pow2);
1010}
1011
/** Convert sixteen QS8 fixed-point values to sixteen 32-bit floats.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Four float32x4 lanes: val[0] holds the lowest 4 elements, val[3] the highest.
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 8 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Sign-extend 16 -> 32 bits
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    // Convert to float and rescale
    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
1051
/** Convert eight QS16 fixed-point values to eight 32-bit floats.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the source Q format.
 *
 * @return Two float32x4 lanes: val[0] holds the low 4 elements, val[1] the high 4.
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Scale factor 2^-fixed_point_position applied after the integer->float conversion
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Sign-extend 16 -> 32 bits
    const int32x4x2_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(a)),
            vmovl_s16(vget_high_qs16(a))
        }
    };

    // Convert to float and rescale
    float32x4x2_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1])
        }
    };

    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);

    return res_f32;
}
1077
/** Compute 1/a for eight QS8 fixed-point values using Newton-Raphson division (non-saturating).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format.
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1103
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001104inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
1105{
1106 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1107 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1108 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1109 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001110 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001111
1112 // Find shift value
1113 const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1114 const qint16x4_t temp = vshl_s16(a, shift_value);
1115
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001116 // Newton-Raphson division initial estimate X0 calculation
1117 qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001118
1119 uint16x4_t set_one = vcgt_s16(x, const_one);
1120 x = vbsl_s16(set_one, const_one, x);
1121
Michalis Spyrou25466a92017-08-17 12:56:46 +01001122 // Use four iterations of Newton-Raphson method to get the result
1123 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1124 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1125 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1126 x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001127
1128 return vshl_s16(x, shift_value);
1129}
1130
/** Compute 1/a for eight QS8 fixed-point values using Newton-Raphson division (saturating arithmetic).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format, using saturating intermediate ops.
 */
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift (saturating)
    return vqshl_s8(x, shift_value);
}
1156
1157inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
1158{
1159 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1160 const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
1161 const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1162 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001163 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001164
1165 // Find shift value
1166 const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
1167 const qint16x4_t temp = vqshl_s16(a, shift_value);
1168
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001169 // Newton-Raphson division initial estimate X0 calculation
1170 qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
Georgios Pinitas9247c922017-06-28 18:29:47 +01001171
1172 uint16x4_t set_one = vcgt_s16(x, const_one);
1173 x = vbsl_s16(set_one, const_one, x);
1174
Michalis Spyrou25466a92017-08-17 12:56:46 +01001175 // Use four iterations of Newton-Raphson method to get the result
1176 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1177 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1178 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
1179 x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
Georgios Pinitas9247c922017-06-28 18:29:47 +01001180
1181 return vqshl_s16(x, shift_value);
1182}
1183
/** Compute 1/a for sixteen QS8 fixed-point values using Newton-Raphson division (non-saturating).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format.
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1210
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001211inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
1212{
1213 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1214 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1215 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1216 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001217 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001218
1219 // Find shift value
1220 const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1221 const qint16x8_t temp = vshlq_s16(a, shift_value);
1222
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001223 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001224 qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
1225
1226 // Set initial guess to one if x > 1
1227 uint16x8_t set_one = vcgtq_s16(x, const_one);
1228 x = vbslq_s16(set_one, const_one, x);
1229
Michalis Spyrou25466a92017-08-17 12:56:46 +01001230 // Use four iterations of Newton-Raphson method to get the result
1231 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1232 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1233 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1234 x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001235
1236 return vshlq_s16(x, shift_value);
1237}
1238
/** Compute 1/a for sixteen QS8 fixed-point values using Newton-Raphson division (saturating arithmetic).
 *
 * @param[in] a                    Input QS8 values (assumed non-zero).
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return The reciprocal of @p a in the same Q format, using saturating intermediate ops.
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    // Constants of the initial estimate X0 = 48/17 - 32/17 * x (0x5A, 0x3C are the Q2.5 encodings)
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);

    // Find shift value that normalizes a (8 = bit width of the element type)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result: x' = x * (2 - temp * x)
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);

    // Undo the normalization shift (saturating)
    return vqshlq_s8(x, shift_value);
}
1265
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001266inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
1267{
1268 // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
1269 const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
1270 const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
1271 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
Michalis Spyrou25466a92017-08-17 12:56:46 +01001272 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001273
1274 // Find shift value
1275 const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
1276 const qint16x8_t temp = vqshlq_s16(a, shift_value);
1277
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001278 // Newton-Raphson division initial estimate X0 calculation
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001279 qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
1280
1281 // Set initial guess to one if x > 1
1282 uint16x8_t set_one = vcgtq_s16(x, const_one);
1283 x = vbslq_s16(set_one, const_one, x);
1284
Michalis Spyrou25466a92017-08-17 12:56:46 +01001285 // Use four iterations of Newton-Raphson method to get the result
1286 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1287 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1288 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
1289 x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001290
Georgios Pinitas00394ae2017-06-22 18:13:55 +01001291 // Saturate result in case of overflow
1292 return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001293}
1294
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001295inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
1296{
1297 return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
1298}
1299
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001300inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
1301{
1302 return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
1303}
1304
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001305inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1306{
1307 return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
1308}
1309
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001310inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1311{
1312 return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
1313}
1314
/** Evaluate a 4-coefficient polynomial on eight QS8 values using Horner's scheme (non-saturating).
 *
 * Coefficients come from log_tab_qs8 (islog) or exp_tab_qs8 (!islog) and are rescaled
 * from their table Q format to the requested one before use.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation
    const qint8x8_t x1  = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1330
/** Evaluate a 4-coefficient polynomial on four QS16 values using Horner's scheme (non-saturating).
 *
 * Coefficients come from log_tab_qs16 (islog) or exp_tab_qs16 (!islog) and are rescaled
 * from their table Q format to the requested one before use.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation
    const qint16x4_t x1  = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1346
/** Evaluate a 4-coefficient polynomial on eight QS8 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_poly_qs8.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint8x8_t x1  = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2  = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3  = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1362
/** Evaluate a 4-coefficient polynomial on four QS16 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_poly_qs16.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint16x4_t x1  = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2  = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3  = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1378
/** Evaluate a 4-coefficient polynomial on sixteen QS8 values using Horner's scheme (non-saturating).
 *
 * 128-bit counterpart of vtaylor_poly_qs8 using the log_tabq_qs8/exp_tabq_qs8 tables.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation
    const qint8x16_t x1  = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1394
/** Evaluate a 4-coefficient polynomial on eight QS16 values using Horner's scheme (non-saturating).
 *
 * 128-bit counterpart of vtaylor_poly_qs16 using the log_tabq_qs16/exp_tabq_qs16 tables.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation
    const qint16x8_t x1  = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1410
/** Evaluate a 4-coefficient polynomial on sixteen QS8 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_polyq_qs8.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint8x16_t x1  = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2  = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3  = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1426
/** Evaluate a 4-coefficient polynomial on eight QS16 values using Horner's scheme (saturating arithmetic).
 *
 * Saturating counterpart of vtaylor_polyq_qs16.
 *
 * @tparam islog Selects the log coefficient table instead of the exp one.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return a * (A + a * (B + a * (C + a * D))) in the same Q format, saturated.
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale table coefficients to the working Q format (log's A gets one extra shift)
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    // Horner evaluation with saturating adds/multiplies
    const qint16x8_t x1  = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2  = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3  = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1442
/** Compute exp(a) for eight QS8 fixed-point values (saturating arithmetic).
 *
 * Range-reduces the input against ln(2), approximates exp on the reduced argument
 * with vqtaylor_poly_qs8, then rescales by a power of two.
 *
 * @param[in] a                    Input QS8 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return exp(a) in the same Q format, saturated on overflow.
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift keeps only the integer part of m)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha), plus the constant term 1
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: scale by 2^dec_m via a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1468
/** Compute exp(a) for four QS16 fixed-point values (saturating arithmetic).
 *
 * Range-reduces the input against ln(2), approximates exp on the reduced argument
 * with vqtaylor_poly_qs16, then rescales by a power of two.
 *
 * @param[in] a                    Input QS16 values.
 * @param[in] fixed_point_position Number of fractional bits of the Q format.
 *
 * @return exp(a) in the same Q format, saturated on overflow.
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m (arithmetic shift keeps only the integer part of m)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha), plus the constant term 1
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: scale by 2^dec_m via a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1494
/** Calculate saturating exponential of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Same algorithm as vqexp_qs8: reduce the argument to [-ln(2), ln(2)],
 * evaluate a polynomial, then shift by the integer multiple of ln(2).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8).
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise exp(a) in the same QS8 format, saturated.
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
    // 0x58 / 0x38 are ln(2) and frac(1/ln(2)) in Q0.7; OR-ing const_one adds the integer bit of 1/ln(2).
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument for the polynomial
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1520
/** Calculate saturating exponential of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Same algorithm as vqexp_qs16: reduce the argument to [-ln(2), ln(2)],
 * evaluate a polynomial, then shift by the integer multiple of ln(2).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16).
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise exp(a) in the same QS16 format, saturated.
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    // 0x58B9 / 0x38AA are ln(2) and frac(1/ln(2)) in Q0.15; OR-ing const_one adds the integer bit of 1/ln(2).
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the reduced argument for the polynomial
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1546
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001547inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1548{
1549 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1550 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1551 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1552
1553 // If 0 < a < 1, calculate log(1/x)
1554 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1555 qint8x8_t recip = vdup_n_s8(0);
1556 recip = vbsl_s8(calc_reciprocal, recip, a);
1557
1558 // Calculate reciprocal
1559 recip = vrecip_qs8(recip, fixed_point_position);
1560 a = vbsl_s8(calc_reciprocal, recip, a);
1561
1562 // Get decimal part of a
1563 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1564 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1565
1566 // Get exponent of 2^n which is equal or less than dec_a
1567 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1568
1569 // Get x to range (1, 2]
1570 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1571 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1572 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1573
1574 // Polynomial Approximation
1575 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1576
1577 // Reconstruct
1578 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1579
1580 // Set negative value for 0 < a < 1
1581 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1582
1583 return poly;
1584}
1585
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001586inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1587{
1588 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1589 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1590 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1591
1592 // If 0 < a < 1, calculate log(1/x)
1593 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1594 qint16x4_t recip = vdup_n_s16(0);
1595 recip = vbsl_s16(calc_reciprocal, recip, a);
1596
1597 // Calculate reciprocal
1598 recip = vrecip_qs16(recip, fixed_point_position);
1599 a = vbsl_s16(calc_reciprocal, recip, a);
1600
1601 // Get decimal part of a
1602 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1603 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1604
1605 // Get exponent of 2^n which is equal or less than dec_a
1606 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1607
1608 // Get x to range (1, 2]
1609 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1610 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1611 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1612
1613 // Polynomial Approximation
1614 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1615
1616 // Reconstruct
1617 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1618
1619 // Set negative value for 0 < a < 1
1620 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1621
1622 return poly;
1623}
1624
/** Calculate the natural logarithm of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used so the polynomial
 * is always evaluated on an argument >= 1. The argument is normalized to
 * (1, 2] and the result reconstructed as (P(x) + n) * ln(2).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise log(a) in the same QS8 format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Lanes with mask set take 'a' (reciprocal needed); other lanes stay 0
    recip                      = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1663
/** Calculate the natural logarithm of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * For 0 < a < 1 the identity log(a) = -log(1/a) is used so the polynomial
 * is always evaluated on an argument >= 1. The argument is normalized to
 * (1, 2] and the result reconstructed as (P(x) + n) * ln(2).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise log(a) in the same QS16 format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Lanes with mask set take 'a' (reciprocal needed); other lanes stay 0
    recip                      = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating variant)
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1702
/** Calculate the inverse square root of a vector of 8-bit fixed point values.
 *
 * Normalizes each lane into (0.5, 2), then runs three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format (non-saturating arithmetic).
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1729
/** Calculate the inverse square root of a vector of 16-bit fixed point values.
 *
 * Normalizes each lane into (0.5, 2), then runs five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format (non-saturating arithmetic).
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1758
/** Calculate the saturating inverse square root of a vector of 8-bit fixed point values.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format, saturated.
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
/** Calculate the saturating inverse square root of a vector of 16-bit fixed point values.
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format, saturated.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1814
/** Calculate the inverse square root of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Normalizes each lane into (0.5, 2), then runs three Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format (non-saturating arithmetic).
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1841
/** Calculate the inverse square root of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Normalizes each lane into (0.5, 2), then runs five Newton-Raphson
 * iterations x' = (x / 2) * (3 - a * x^2) and denormalizes the result
 * with half the original shift (since sqrt halves the exponent).
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format (non-saturating arithmetic).
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1870
/** Calculate the saturating inverse square root of a vector of 16 8-bit fixed point values (q-register variant).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs8 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 8-bit fixed point values (QS8). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS8 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS8 format, saturated.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1897
/** Calculate the saturating inverse square root of a vector of 8 16-bit fixed point values (q-register variant).
 *
 * Same Newton-Raphson scheme as vinvsqrtq_qs16 but using saturating
 * add/sub/shift/multiply intrinsics throughout.
 *
 * @param a                    Vector of 16-bit fixed point values (QS16). Expected a > 0.
 * @param fixed_point_position Number of fractional bits in the QS16 format.
 *
 * @return Element-wise 1/sqrt(a) in the same QS16 format, saturated.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Half the normalization shift, applied at the end (sqrt halves the exponent)
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1926
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001927inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001928{
1929 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1930 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1931
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001932 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1933 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1934 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1935 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001936
1937 return tanh;
1938}
1939
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001940inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001941{
1942 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1943 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1944
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001945 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1946 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1947 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1948 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001949
1950 return tanh;
1951}
1952
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001953inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001954{
1955 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1956 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1957
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001958 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1959 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1960 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1961 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001962
1963 return tanh;
1964}
1965
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001966inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1967{
1968 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1969 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1970
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001971 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1972 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1973 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1974 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001975
1976 return tanh;
1977}
1978
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001979inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1980{
1981 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1982}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001983
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001984inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1985{
1986 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1987}
1988
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001989inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1990{
1991 float32x4x2_t res =
1992 {
1993 {
1994 vmaxq_f32(a.val[0], b.val[0]),
1995 vmaxq_f32(a.val[1], b.val[1])
1996 }
1997 };
1998 return res;
1999}
Alex Gildayc357c472018-03-21 13:54:09 +00002000#endif /* DOXYGEN_SKIP_THIS */
Gian Marco Iodice356f6432017-09-22 11:32:21 +01002001} // namespace arm_compute