blob: dd1066d6bcc1aca4503c117f25eda519645c0bf3 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas00394ae2017-06-22 18:13:55 +010024#include <limits>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26namespace arm_compute
27{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
75
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
123
/** Get the lower half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Get the lower half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Get the upper half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Get the upper half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
143
/** Load 8 QS8 elements from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load a single QS8 element and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load a single QS16 element and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load a single QS8 element and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load a single QS16 element and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Load 2x8 interleaved QS16 elements (de-interleaving 2-element structures). */
inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
{
    return vld2q_s16(addr);
}
188
/** Store 8 QS8 elements to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store 2x8 QS16 elements to memory, interleaving 2-element structures. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}
213
/** Saturating narrow: QS16x8 -> QS8x8. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32x4 -> QS16x4. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}

/** Duplicate a QS8 scalar across all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a QS16 scalar across all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate a QS8 scalar across all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
238
/** Duplicate a float scalar across 16 lanes, converted to QS8 fixed point.
 *
 * @param a                    Value to broadcast.
 * @param fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 16-lane QS8 vector holding the (saturated) fixed-point value.
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Broadcast once, then replicate the register into the 4-vector structure
    // expected by the float -> QS8 conversion helper.
    const float32x4_t   dup       = vdupq_n_f32(a);
    const float32x4x4_t broadcast = { { dup, dup, dup, dup } };
    return vqcvtq_qs8_f32(broadcast, fixed_point_position);
}

/** Duplicate a float scalar across 8 lanes, converted to QS16 fixed point.
 *
 * @param a                    Value to broadcast.
 * @param fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 8-lane QS16 vector holding the (saturated) fixed-point value.
 */
inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
{
    const float32x4_t   dup       = vdupq_n_f32(a);
    const float32x4x2_t broadcast = { { dup, dup } };
    return vqcvtq_qs16_f32(broadcast, fixed_point_position);
}
264
/** Duplicate a QS16 scalar across all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a QS32 scalar across all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
274
/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating: |INT16_MIN| wraps). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating: |INT16_MIN| wraps). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
314
/** Element-wise maximum of two 8-lane QS8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two 4-lane QS16 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two 16-lane QS8 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent QS8 elements across a and b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent QS16 elements across a and b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two 8-lane QS16 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two 8-lane QS8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two 4-lane QS16 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two 16-lane QS8 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent QS8 elements across a and b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent QS16 elements across a and b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two 8-lane QS16 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
374
/** Element-wise (wrapping) addition of two 8-lane QS8 vectors. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Element-wise (wrapping) addition of two 4-lane QS16 vectors. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Element-wise (wrapping) addition of two 16-lane QS8 vectors. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Element-wise (wrapping) addition of two 8-lane QS16 vectors. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Element-wise saturating addition of two 8-lane QS8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Element-wise saturating addition of two 4-lane QS16 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Element-wise saturating addition of two 2-lane QS32 vectors. */
inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
{
    return vqadd_s32(a, b);
}

/** Element-wise saturating addition of two 16-lane QS8 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Element-wise saturating addition of two 8-lane QS16 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Element-wise saturating addition of two 4-lane QS32 vectors. */
inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
{
    return vqaddq_s32(a, b);
}

/** Pairwise add-long: sums adjacent QS8 pairs into 4 widened 16-bit lanes. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
429
/** Element-wise (wrapping) subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Element-wise (wrapping) subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Element-wise (wrapping) subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Element-wise (wrapping) subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Element-wise saturating subtraction of two 8-lane QS8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Element-wise saturating subtraction of two 4-lane QS16 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Element-wise saturating subtraction of two 16-lane QS8 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Element-wise saturating subtraction of two 8-lane QS16 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
469
/** Fixed-point multiplication of two 8-lane QS8 vectors (non-saturating).
 *
 * Widens to 16 bits, multiplies, rounds, shifts back down by
 * @p fixed_point_position and narrows (wrapping) to QS8.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply accumulated on top of the bias, then scaled back down
    const qint16x8_t scaled = vshlq_s16(vmlal_s8(rounding, a, b), shift_right);

    // Narrow back to QS8 (wrapping)
    return vmovn_s16(scaled);
}

/** Fixed-point multiplication of two 4-lane QS16 vectors (non-saturating).
 *
 * Widens to 32 bits, multiplies, rounds, shifts back down by
 * @p fixed_point_position and narrows (wrapping) to QS16.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply accumulated on top of the bias, then scaled back down
    const qint32x4_t scaled = vshlq_s32(vmlal_s16(rounding, a, b), shift_right);

    // Narrow back to QS16 (wrapping)
    return vmovn_s32(scaled);
}
503
/** Fixed-point multiplication of two 16-lane QS8 vectors (non-saturating).
 *
 * Processes the low and high halves independently through the widened
 * 16-bit domain, then recombines.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(a), vget_low_s8(b));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(a), vget_high_s8(b));

    // Scale back down by fixed_point_position
    lo = vshlq_s16(lo, shift_right);
    hi = vshlq_s16(hi, shift_right);

    // Narrow back to QS8 (wrapping) and recombine
    return vcombine_s8(vmovn_s16(lo), vmovn_s16(hi));
}

/** Fixed-point multiplication of two 8-lane QS16 vectors (non-saturating).
 *
 * Processes the low and high halves independently through the widened
 * 32-bit domain, then recombines.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(a), vget_low_qs16(b));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(a), vget_high_qs16(b));

    // Scale back down by fixed_point_position
    lo = vshlq_s32(lo, shift_right);
    hi = vshlq_s32(hi, shift_right);

    // Narrow back to QS16 (wrapping) and recombine
    return vcombine_s16(vmovn_s32(lo), vmovn_s32(hi));
}
543
/** Saturating fixed-point multiplication of two 8-lane QS8 vectors.
 *
 * Same scheme as vmul_qs8 but uses saturating shift and narrow, so
 * out-of-range products clamp instead of wrapping.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply on top of the bias, then saturating scale-down
    const qint16x8_t scaled = vqshlq_s16(vmlal_s8(rounding, a, b), shift_right);

    // Narrow back to QS8 with saturation
    return vqmovn_s16(scaled);
}

/** Saturating fixed-point multiplication of two 4-lane QS16 vectors.
 *
 * Same scheme as vmul_qs16 but uses saturating shift and narrow, so
 * out-of-range products clamp instead of wrapping.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply on top of the bias, then saturating scale-down
    const qint32x4_t scaled = vqshlq_s32(vmlal_s16(rounding, a, b), shift_right);

    // Narrow back to QS16 with saturation
    return vqmovn_s32(scaled);
}
577
/** Saturating fixed-point multiplication of two 16-lane QS8 vectors.
 *
 * Low/high halves are widened, multiplied, scaled with saturating shifts,
 * then narrowed with saturation and recombined.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(a), vget_low_s8(b));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(a), vget_high_s8(b));

    // Saturating scale-down by fixed_point_position
    lo = vqshlq_s16(lo, shift_right);
    hi = vqshlq_s16(hi, shift_right);

    // Narrow back to QS8 with saturation and recombine
    return vcombine_s8(vqmovn_s16(lo), vqmovn_s16(hi));
}

/** Saturating fixed-point multiplication of two 8-lane QS16 vectors.
 *
 * Low/high halves are widened, multiplied, scaled with saturating shifts,
 * then narrowed with saturation and recombined.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(a), vget_low_qs16(b));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(a), vget_high_qs16(b));

    // Saturating scale-down by fixed_point_position
    lo = vqshlq_s32(lo, shift_right);
    hi = vqshlq_s32(hi, shift_right);

    // Narrow back to QS16 with saturation and recombine
    return vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));
}
617
/** Widening fixed-point multiplication: QS8x8 * QS8x8 -> QS16x8.
 *
 * The product is rescaled with a saturating *rounding* right shift
 * (vqrshlq with a negative shift count) by @p fixed_point_position.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
626
/** Widening fixed-point multiplication: QS16x4 * QS16x4 -> QS32x4.
 *
 * The product is rescaled with a saturating right shift by
 * @p fixed_point_position.
 *
 * NOTE(review): unlike vmull_qs8 (which uses the *rounding* shift
 * vqrshlq), this variant truncates via vqshlq; behavior intentionally
 * kept as-is for bit-exact compatibility.
 */
inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Vector multiply long. (The previous initialization of the temporary
    // with a rounding constant was dead code: vmull_s16 overwrites its
    // destination rather than accumulating into it.)
    const qint32x4_t tmp = vmull_s16(a, b);

    // Saturating shift right by fixed_point_position
    return vqshlq_s32(tmp, fixed_point_position_s32);
}
640
/** Fixed-point multiply-accumulate: a + (b * c) on 8-lane QS8 vectors
 * (non-saturating). */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint16x8_t rounding = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of b and c on top of the bias, then scale back down
    const qint16x8_t product = vshlq_s16(vmlal_s8(rounding, b, c), shift_right);

    // Narrow back to QS8 (wrapping) and accumulate onto a
    return vadd_s8(a, vmovn_s16(product));
}

/** Fixed-point multiply-accumulate: a + (b * c) on 4-lane QS16 vectors
 * (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    // Negative shift amount -> arithmetic shift right by fixed_point_position
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);

    // Rounding bias (half of one LSB of the result format)
    const qint32x4_t rounding = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of b and c on top of the bias, then scale back down
    const qint32x4_t product = vshlq_s32(vmlal_s16(rounding, b, c), shift_right);

    // Narrow back to QS16 (wrapping) and accumulate onto a
    return vadd_s16(a, vmovn_s32(product));
}
674
/** Fixed-point multiply-accumulate: a + (b * c) on 16-lane QS8 vectors
 * (non-saturating). Low/high halves are processed independently. */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t shift_right = vdupq_n_s16(-fixed_point_position);
    const qint16x8_t rounding   = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Widening multiply of each half of b*c on top of the rounding bias
    qint16x8_t lo = vmlal_s8(rounding, vget_low_s8(b), vget_low_s8(c));
    qint16x8_t hi = vmlal_s8(rounding, vget_high_s8(b), vget_high_s8(c));

    // Scale back down by fixed_point_position
    lo = vshlq_s16(lo, shift_right);
    hi = vshlq_s16(hi, shift_right);

    // Narrow (wrapping), accumulate each half onto a, and recombine
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(lo)), vadd_s8(vget_high_s8(a), vmovn_s16(hi)));
}

/** Fixed-point multiply-accumulate: a + (b * c) on 8-lane QS16 vectors
 * (non-saturating). Low/high halves are processed independently. */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t shift_right = vdupq_n_s32(-fixed_point_position);
    const qint32x4_t rounding   = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Widening multiply of each half of b*c on top of the rounding bias
    qint32x4_t lo = vmlal_s16(rounding, vget_low_qs16(b), vget_low_qs16(c));
    qint32x4_t hi = vmlal_s16(rounding, vget_high_qs16(b), vget_high_qs16(c));

    // Scale back down by fixed_point_position
    lo = vshlq_s32(lo, shift_right);
    hi = vshlq_s32(hi, shift_right);

    // Narrow (wrapping), accumulate each half onto a, and recombine
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(lo)), vadd_s16(vget_high_qs16(a), vmovn_s32(hi)));
}
714
/** Saturating fixed-point multiply-accumulate: a + (b * c) on 8-lane QS8
 * vectors; shift, narrow and final add all saturate. */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + (b * c) on 4-lane QS16
 * vectors; shift, narrow and final add all saturate. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
748
/** Saturating fixed-point multiply-accumulate: a + (b * c) on 16-lane QS8
 * vectors; halves are processed independently and recombined. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + (b * c) on 8-lane QS16
 * vectors; halves are processed independently and recombined. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
790
/** Widening fixed-point multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8)
 * (non-saturating); the product is rescaled before accumulation. */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4)
 * (non-saturating); the product is rescaled before accumulation. */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
824
/** Saturating widening multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8);
 * both the rescaling shift and the final accumulation saturate. */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4);
 * both the rescaling shift and the final accumulation saturate. */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
858
/** Convert 2x4 floats to an 8-lane QS8 fixed-point vector with saturation.
 *
 * Values are scaled by 2^fixed_point_position and rounded
 * half-away-from-zero before narrowing.
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t scale    = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero     = vdupq_n_f32(0);
    const float32x4_t half_pos = vdupq_n_f32(0.5f);
    const float32x4_t half_neg = vdupq_n_f32(-0.5f);

    // Sign-dependent rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4_t r0 = vbslq_f32(vcgeq_f32(a.val[0], zero), half_pos, half_neg);
    float32x4_t r1 = vbslq_f32(vcgeq_f32(a.val[1], zero), half_pos, half_neg);

    // Scale onto the bias: r = bias + a * 2^fp
    r0 = vmlaq_f32(r0, a.val[0], scale);
    r1 = vmlaq_f32(r1, a.val[1], scale);

    // Convert to integers, narrow to 16 bits with saturation, recombine
    const int16x8_t wide = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(r0)), vqmovn_s32(vcvtq_s32_f32(r1)));

    // Final saturating narrow down to QS8
    return vqmovn_s16(wide);
}

/** Convert 4 floats to a 4-lane QS16 fixed-point vector with saturation.
 *
 * Values are scaled by 2^fixed_point_position and rounded
 * half-away-from-zero before narrowing.
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent rounding bias, then scale onto it
    float32x4_t rounded = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
    rounded             = vmlaq_f32(rounded, a, scale);

    // Convert to integers and narrow to QS16 with saturation
    return vqmovn_s32(vcvtq_s32_f32(rounded));
}
899
/** Convert 4x4 floats to a 16-lane QS8 fixed-point vector with saturation.
 *
 * Each lane is scaled by 2^fixed_point_position and rounded
 * half-away-from-zero (sign-selected +/-0.5 bias) before the two-stage
 * saturating narrow 32 -> 16 -> 8 bits.
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent rounding bias: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    // res = bias + a * 2^fixed_point_position
    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    // Truncating float -> int32 conversion (bias above implements the rounding)
    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // First saturating narrow: 32 -> 16 bits
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Second saturating narrow: 16 -> 8 bits
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
939
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    // Convert 8 floats to 16-bit fixed point (Q(15-n).n) with saturation,
    // rounding half away from zero.
    const float32x4_t scale     = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
    const float32x4_t zero      = vdupq_n_f32(0);
    const float32x4_t pos_round = vdupq_n_f32(0.5f);
    const float32x4_t neg_round = vdupq_n_f32(-0.5f);

    // Per-lane rounding offset plus scaled value
    const float32x4_t lo = vmlaq_f32(vbslq_f32(vcgeq_f32(a.val[0], zero), pos_round, neg_round), a.val[0], scale);
    const float32x4_t hi = vmlaq_f32(vbslq_f32(vcgeq_f32(a.val[1], zero), pos_round, neg_round), a.val[1], scale);

    // Truncate to s32, then narrow to s16 with saturation
    return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(lo)), vqmovn_s32(vcvtq_s32_f32(hi)));
}
965
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    // Convert 8 fixed-point values back to float by widening and scaling by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 once, then s16 -> s32 per half before the float conversion
    const int16x8_t widened = vmovl_s8(a);

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(widened))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(widened))), scale);
    return out;
}
993
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Convert 4 fixed-point values back to float: widen to s32, convert, scale by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), scale);
}
1001
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    // Convert 16 fixed-point values back to float by widening in two steps
    // (s8 -> s16 -> s32) and scaling each group of four lanes by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    const int16x8_t lo_s16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi_s16 = vmovl_s8(vget_high_s8(a));

    float32x4x4_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(lo_s16))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(lo_s16))), scale);
    out.val[2] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(hi_s16))), scale);
    out.val[3] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(hi_s16))), scale);
    return out;
}
1041
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    // Convert 8 fixed-point values back to float: widen each half to s32,
    // convert, and scale by 2^-n.
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), scale);
    return out;
}
1067
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // Fixed-point reciprocal 1/a via Newton-Raphson division (non-saturating ops).
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s8(x, shift_value);
}
1092
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // Fixed-point reciprocal 1/a via Newton-Raphson division (non-saturating ops).
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: a qint16 element holds 16 bits, not 8 - the previous value of 8
    // produced an over-large shift (wrong normalization) for every 16-bit input.
    // This matches vrecipq_qs16, which already uses 16.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshl_s16(x, shift_value);
}
1119
inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating variant of vrecip_qs8: same Newton-Raphson division with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vqshl_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s8(x, shift_value);
}
1144
inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating variant of vrecip_qs16: same Newton-Raphson division with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input.
    // Fix: a qint16 element holds 16 bits, not 8 - the previous value of 8
    // produced an over-large shift (wrong normalization) for every 16-bit input.
    // This matches vqrecipq_qs16, which already uses 16.
    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vqshl_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial estimate to 1.0 so the iteration stays in range
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqadd_s16(x, vqmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshl_s16(x, shift_value);
}
1171
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // 128-bit variant of vrecip_qs8: Newton-Raphson reciprocal, non-saturating ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s8(x, shift_value);
}
1197
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // 128-bit variant of vrecip_qs16: Newton-Raphson reciprocal, non-saturating ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (this function
    // previously used 0x5A56, inconsistent with vrecip_qs16).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element bit width of qint16)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vshlq_s16(x, shift_value);
}
1225
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit variant of vrecip_qs8: Newton-Raphson reciprocal with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalizes the input (8 = element bit width of qint8)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Undo the normalization shift
    return vqshlq_s8(x, shift_value);
}
1251
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating 128-bit variant of vrecip_qs16: Newton-Raphson reciprocal with vq* ops.
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // Fix: 48/17 in Q2.13 is round(2.8235 * 2^13) = 23130 = 0x5A5A (this function
    // previously used 0x5A56, inconsistent with vqrecip_qs16).
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalizes the input (16 = element bit width of qint16)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Newton-Raphson division initial estimate X0 calculation: X0 = 48/17 - 32/17 * temp
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result:
    // x_{k+1} = x_k + x_k * (1 - temp * x_k)
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Saturate result in case of overflow: 1/0 saturates to the maximum representable value
    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
}
1280
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint8x8_t b_inv = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_inv, fixed_point_position);
}
1285
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint16x4_t b_inv = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_inv, fixed_point_position);
}
1290
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint8x16_t b_inv = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_inv, fixed_point_position);
}
1295
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    // a / b computed as a * (1/b), with the reciprocal from Newton-Raphson division
    const qint16x8_t b_inv = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_inv, fixed_point_position);
}
1300
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Horner evaluation of res = a * (A + a*(B + a*(C + a*D))) with coefficients
    // from exp_tab_qs8 (islog == false) or log_tab_qs8 (islog == true).
    // Table entries are stored in Q0.7; vrshl rescales them to the caller's format.
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
    return res;
}
1316
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Horner evaluation of res = a * (A + a*(B + a*(C + a*D))) with coefficients
    // from exp_tab_qs16 (islog == false) or log_tab_qs16 (islog == true).
    // Table entries are stored in Q0.15; vrshl rescales them to the caller's format.
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
    return res;
}
1332
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating variant of vtaylor_poly_qs8: same Horner polynomial
    // res = a * (A + a*(B + a*(C + a*D))) evaluated with vq* (saturating) ops.
    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t const_one   = vdup_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
    return res;
}
1348
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating variant of vtaylor_poly_qs16: same Horner polynomial
    // res = a * (A + a*(B + a*(C + a*D))) evaluated with vq* (saturating) ops.
    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t const_one   = vdup_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
    return res;
}
1364
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // 128-bit variant of vtaylor_poly_qs8: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) using the q-register coefficient tables.
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1380
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // 128-bit variant of vtaylor_poly_qs16: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) using the q-register coefficient tables.
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1396
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit variant: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) with vq* (saturating) ops.
    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t const_one   = vdupq_n_s8(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
    return res;
}
1412
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating 128-bit variant: Horner evaluation of
    // res = a * (A + a*(B + a*(C + a*D))) with vq* (saturating) ops.
    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t const_one   = vdupq_n_s16(1);
    // For log, the leading coefficient is right-shifted one bit less (i.e. doubled)
    // -- presumably to compensate for how the log table is scaled; TODO confirm.
    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
    return res;
}
1428
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating fixed-point exp(a): reduce the argument to a residual in [0, ln(2)),
    // approximate exp on the residual with a polynomial, then reconstruct with a
    // power-of-two shift.
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1454
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating fixed-point exp(a) for 16-bit lanes: range-reduce to [0, ln(2)),
    // approximate with a polynomial, reconstruct with a power-of-two shift.
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1480
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating 128-bit fixed-point exp(a): range-reduce to [0, ln(2)),
    // approximate with a polynomial, reconstruct with a power-of-two shift.
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    // 1/ln(2) ~= 1.4427: OR merges the integer bit (const_one == 1.0) with the
    // rescaled fractional part (the two bit ranges do not overlap)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // get decimal part from m: arithmetic right shift by the fractional width == floor(m)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual inside the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1, then add one
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1506
/** Calculate the saturating exponential of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vqexp_qs16: range reduction by ln(2), Taylor
 * polynomial on the residual, reconstruction by a saturating left shift.
 *
 * @param[in] a                    Input fixed point vector.
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return exp(a) in the same fixed point format (saturated).
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 constants below to the requested fixed point format.
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                      // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Get the integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|, the residual within [-ln(2), ln(2)]
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha) - 1
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: exp(a) = exp(alpha) * 2^dec_m (saturating shift)
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1532
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001533inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
1534{
1535 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1536 const qint8x8_t const_seven_dec = vdup_n_s8(7);
1537 const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
1538
1539 // If 0 < a < 1, calculate log(1/x)
1540 uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
1541 qint8x8_t recip = vdup_n_s8(0);
1542 recip = vbsl_s8(calc_reciprocal, recip, a);
1543
1544 // Calculate reciprocal
1545 recip = vrecip_qs8(recip, fixed_point_position);
1546 a = vbsl_s8(calc_reciprocal, recip, a);
1547
1548 // Get decimal part of a
1549 qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
1550 qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
1551
1552 // Get exponent of 2^n which is equal or less than dec_a
1553 shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
1554
1555 // Get x to range (1, 2]
1556 const qint8x8_t shift_value_neg = vneg_s8(shift_value);
1557 const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
1558 const qint8x8_t sum = vmul_s8(shift_value, const_one);
1559
1560 // Polynomial Approximation
1561 qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
1562
1563 // Reconstruct
1564 poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
1565
1566 // Set negative value for 0 < a < 1
1567 poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
1568
1569 return poly;
1570}
1571
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001572inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
1573{
1574 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1575 const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
1576 const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
1577
1578 // If 0 < a < 1, calculate log(1/x)
1579 uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
1580 qint16x4_t recip = vdup_n_s16(0);
1581 recip = vbsl_s16(calc_reciprocal, recip, a);
1582
1583 // Calculate reciprocal
1584 recip = vrecip_qs16(recip, fixed_point_position);
1585 a = vbsl_s16(calc_reciprocal, recip, a);
1586
1587 // Get decimal part of a
1588 qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
1589 qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position
1590
1591 // Get exponent of 2^n which is equal or less than dec_a
1592 shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
1593
1594 // Get x to range (1, 2]
1595 const qint16x4_t shift_value_neg = vneg_s16(shift_value);
1596 const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
1597 const qint16x4_t sum = vmul_s16(shift_value, const_one);
1598
1599 // Polynomial Approximation
1600 qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
1601
1602 // Reconstruct
1603 poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
1604
1605 // Set negative value for 0 < a < 1
1606 poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
1607
1608 return poly;
1609}
1610
/** Calculate the natural logarithm of an 8-bit fixed point vector (16 elements).
 *
 * For inputs in (0, 1) the reciprocal is taken first and the sign of the result
 * is flipped (log(x) = -log(1/x)). The input is then normalized to (1, 2], a
 * Taylor polynomial approximates log on that range, and the power-of-two exponent
 * is added back scaled by ln(2).
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return log(a) in the same fixed point format.
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    // Keep 'a' only in the lanes that need the reciprocal; other lanes stay 0.
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1649
/** Calculate the natural logarithm of a 16-bit fixed point vector (8 elements).
 *
 * For inputs in (0, 1) the reciprocal is taken first and the sign of the result
 * is flipped (log(x) = -log(1/x)). The input is then normalized to (1, 2], a
 * Taylor polynomial approximates log on that range, and the power-of-two exponent
 * is added back scaled by ln(2).
 *
 * NOTE(review): unlike the other vlog* variants, this one mixes in saturating ops
 * (vqrecipq_qs16, vqsubq_s16, vqmulq_qs16) — presumably to avoid 16-bit overflow;
 * confirm against the sibling implementations.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return log(a) in the same fixed point format.
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip           = vdupq_n_s16(0);
    // Keep 'a' only in the lanes that need the reciprocal; other lanes stay 0.
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a     = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1688
/** Calculate the inverse square root of an 8-bit fixed point vector (8 elements).
 *
 * Normalizes the input into (0.5, 2), runs three Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2), then shifts back by half the normalization amount.
 * Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift: 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s8(x, shift_value2);
}
1715
/** Calculate the inverse square root of a 16-bit fixed point vector (4 elements).
 *
 * Normalizes the input into (0.5, 2), runs five Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2), then shifts back by half the normalization amount.
 * Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift: 1/sqrt(a * 2^-s) = 2^(s/2) / sqrt(a)
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1744
/** Calculate the saturating inverse square root of an 8-bit fixed point vector (8 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs8 (three iterations of
 * x = (x / 2) * (3 - a * x^2)), but using saturating add/sub/neg/shift/mul ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp     = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp               = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));

    temp = vqshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1771
/** Calculate the saturating inverse square root of a 16-bit fixed point vector (4 elements).
 *
 * Same Newton-Raphson scheme as vinvsqrt_qs16 (five iterations of
 * x = (x / 2) * (3 - a * x^2)), but using saturating add/sub/neg/shift/mul ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp     = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp                = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));

    temp = vqshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1800
/** Calculate the inverse square root of an 8-bit fixed point vector (16 elements).
 *
 * Quad-register counterpart of vinvsqrt_qs8: normalize into (0.5, 2), three
 * Newton-Raphson iterations of x = (x / 2) * (3 - a * x^2), denormalize by half
 * the shift. Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s8(x, shift_value2);
}
1827
/** Calculate the inverse square root of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vinvsqrt_qs16: normalize into (0.5, 2), five
 * Newton-Raphson iterations of x = (x / 2) * (3 - a * x^2), denormalize by half
 * the shift. Non-saturating variant.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1856
/** Calculate the saturating inverse square root of an 8-bit fixed point vector (16 elements).
 *
 * Quad-register counterpart of vqinvsqrt_qs8: three Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2) using saturating ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp     = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));

    temp = vqshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1883
/** Calculate the saturating inverse square root of a 16-bit fixed point vector (8 elements).
 *
 * Quad-register counterpart of vqinvsqrt_qs16: five Newton-Raphson iterations of
 * x = (x / 2) * (3 - a * x^2) using saturating ops.
 *
 * @param[in] a                    Input fixed point vector (values must be > 0).
 * @param[in] fixed_point_position Number of fractional bits.
 *
 * @return 1/sqrt(a) in the same fixed point format (saturated).
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp     = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // Output must be shifted by half the normalization shift.
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));

    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1912
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001913inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001914{
1915 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1916 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1917
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001918 const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1919 const qint8x8_t num = vqsub_qs8(exp2x, const_one);
1920 const qint8x8_t den = vqadd_qs8(exp2x, const_one);
1921 const qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001922
1923 return tanh;
1924}
1925
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001926inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001927{
1928 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1929 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1930
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001931 const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1932 const qint16x4_t num = vqsub_qs16(exp2x, const_one);
1933 const qint16x4_t den = vqadd_qs16(exp2x, const_one);
1934 const qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001935
1936 return tanh;
1937}
1938
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001939inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001940{
1941 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1942 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1943
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001944 const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1945 const qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1946 const qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1947 const qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001948
1949 return tanh;
1950}
1951
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001952inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
1953{
1954 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1955 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1956
Michalis Spyroubbd3d602017-06-21 17:29:40 +01001957 const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1958 const qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1959 const qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1960 const qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
Georgios Pinitasccc65d42017-06-27 17:39:11 +01001961
1962 return tanh;
1963}
1964
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001965inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1966{
1967 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1968}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001969
Michele Di Giorgiod5e65c72017-07-26 17:09:17 +01001970inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
1971{
1972 return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
1973}
1974
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001975inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1976{
1977 float32x4x2_t res =
1978 {
1979 {
1980 vmaxq_f32(a.val[0], b.val[0]),
1981 vmaxq_f32(a.val[1], b.val[1])
1982 }
1983 };
1984 return res;
1985}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001986}