blob: 92af82cf716bc11da21f1aac8429feec20ec6b83 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25namespace arm_compute
26{
// Coefficients a0..a3 of the degree-3 polynomial used by the fixed-point
// exp() approximation; one table per element width/count combination.
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};
74
// Coefficients a0..a3 of the degree-3 polynomial used by the fixed-point
// log() approximation. The first coefficient exceeds 1.0, hence its wider
// Q1.x format noted below.
/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C),  // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29),  // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F),  // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933),  // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C),  // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29),  // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F),  // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933),  // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
// Thin typed wrappers mapping the fixed-point (qint*) vector typedefs onto
// the corresponding plain NEON intrinsics. No arithmetic is performed here.

/** Lower half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Lower half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Upper half (8 elements) of a 16-element QS8 vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Upper half (4 elements) of an 8-element QS16 vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}

/** Load 8 QS8 elements from @p addr. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load 4 QS16 elements from @p addr. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load 16 QS8 elements from @p addr. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load 8 QS16 elements from @p addr. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one QS8 value from @p addr and broadcast it to all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one QS16 value from @p addr and broadcast it to all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one QS8 value from @p addr and broadcast it to all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one QS16 value from @p addr and broadcast it to all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}

/** Store 8 QS8 elements to @p addr. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store 4 QS16 elements to @p addr. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store 16 QS8 elements to @p addr. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store 8 QS16 elements to @p addr. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Saturating narrow: QS16x8 -> QS8x8. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: QS32x4 -> QS16x4. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
212
/** Broadcast a QS8 scalar to all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Broadcast a QS16 scalar to all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Broadcast a QS8 scalar to all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}

/** Broadcast a float to 16 lanes, converting it to QS8 fixed point.
 *
 * @param[in] a                    Value to broadcast
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return 16-lane QS8 vector holding @p a converted to fixed point
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Replicate the scalar into a 4x4 float structure, then convert the
    // whole structure with the quad float->QS8 conversion helper.
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}

/** Broadcast a QS16 scalar to all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Broadcast a QS32 scalar to all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
251
// Element-wise absolute value / min / max wrappers. Fixed-point position is
// irrelevant for these operations, so they forward directly to the NEON
// integer intrinsics. "q" prefix in the name = saturating variant; "p" = pairwise.

/** Absolute value of 8 QS8 elements (non-saturating: |INT8_MIN| wraps). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of 4 QS16 elements (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of 16 QS8 elements (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of 8 QS16 elements (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of 8 QS8 elements. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of 4 QS16 elements. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of 16 QS8 elements. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of 8 QS16 elements. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}

/** Element-wise maximum of two QS8x8 vectors. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Element-wise maximum of two QS16x4 vectors. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Element-wise maximum of two QS8x16 vectors. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of two QS8x8 vectors. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of two QS16x4 vectors. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Element-wise maximum of two QS16x8 vectors. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Element-wise minimum of two QS8x8 vectors. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Element-wise minimum of two QS16x4 vectors. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Element-wise minimum of two QS8x16 vectors. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of two QS8x8 vectors. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of two QS16x4 vectors. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Element-wise minimum of two QS16x8 vectors. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
351
// Fixed-point addition: with equal fixed_point_position on both operands the
// operation is the plain integer addition, so these forward to NEON directly.

/** Add two QS8x8 vectors (wrapping on overflow). */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Add two QS16x4 vectors (wrapping on overflow). */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Add two QS8x16 vectors (wrapping on overflow). */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Add two QS16x8 vectors (wrapping on overflow). */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating add of two QS8x8 vectors. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating add of two QS16x4 vectors. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating add of two QS8x16 vectors. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating add of two QS16x8 vectors. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}
391
392inline int16x4_t vpaddl_qs8(qint8x8_t a)
393{
394 return vpaddl_s8(a);
395}
396
// Fixed-point subtraction: like addition, identical to integer subtraction
// when both operands share the same fixed_point_position.

/** Subtract two QS8x8 vectors (wrapping on overflow). */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Subtract two QS16x4 vectors (wrapping on overflow). */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Subtract two QS8x16 vectors (wrapping on overflow). */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Subtract two QS16x8 vectors (wrapping on overflow). */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtract of two QS8x8 vectors. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtract of two QS16x4 vectors. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtract of two QS8x16 vectors. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtract of two QS16x8 vectors. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
436
// Fixed-point multiply: widen, multiply-accumulate onto a rounding bias of
// 0.5 ulp, shift right by fixed_point_position, narrow back.
// NOTE(review): the rounding term 1 << (fixed_point_position - 1) assumes
// fixed_point_position >= 1 — confirm callers never pass 0.

/** Multiply two QS8x8 vectors in fixed point with round-to-nearest
 * (narrowing wraps rather than saturates — see vqmul_qs8 for the
 * saturating variant).
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Multiply two QS16x4 vectors in fixed point with round-to-nearest
 * (non-saturating narrow).
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}

/** Multiply two QS8x16 vectors in fixed point with round-to-nearest
 * (non-saturating narrow). Processes the low and high halves separately
 * because the widened intermediate does not fit in one register.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Multiply two QS16x8 vectors in fixed point with round-to-nearest
 * (non-saturating narrow), low/high halves processed separately.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
510
// Saturating fixed-point multiply: same rounding scheme as vmul_* but the
// shift (vqshlq) and the narrowing (vqmovn) both saturate.

/** Saturating fixed-point multiply of two QS8x8 vectors with
 * round-to-nearest.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed-point multiply of two QS16x4 vectors with
 * round-to-nearest.
 */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}

/** Saturating fixed-point multiply of two QS8x16 vectors, low/high halves
 * processed separately.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed-point multiply of two QS16x8 vectors, low/high halves
 * processed separately.
 */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
584
/** Widening fixed-point multiply: QS8x8 * QS8x8 -> QS16x8.
 * Rounding and saturation are handled by the rounding saturating shift
 * (vqrshlq_s16) instead of an explicit bias term.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}

/** Fixed-point multiply-accumulate: a + b * c on QS8x8 vectors, with
 * round-to-nearest on the product and a wrapping (non-saturating) final add.
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on QS16x4 vectors
 * (non-saturating).
 */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}

/** Fixed-point multiply-accumulate: a + b * c on QS8x16 vectors
 * (non-saturating), low/high halves processed separately.
 */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed-point multiply-accumulate: a + b * c on QS16x8 vectors
 * (non-saturating), low/high halves processed separately.
 */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
}
667
/** Saturating fixed-point multiply-accumulate: a + b * c on QS8x8 vectors.
 * Shift, narrow and final add all saturate.
 */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS16x4 vectors. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS8x16 vectors,
 * low/high halves processed separately.
 */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed-point multiply-accumulate: a + b * c on QS16x8 vectors,
 * low/high halves processed separately.
 */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
743
/** Widening fixed-point multiply-accumulate: a(QS16x8) + b(QS8x8) * c(QS8x8).
 * The product is rounded and shifted but the accumulation stays in the wide
 * type (non-saturating).
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed-point multiply-accumulate: a(QS32x4) + b(QS16x4) * c(QS16x4)
 * (non-saturating).
 */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate:
 * a(QS16x8) + b(QS8x8) * c(QS8x8); shift and final add saturate.
 */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening fixed-point multiply-accumulate:
 * a(QS32x4) + b(QS16x4) * c(QS16x4); shift and final add saturate.
 */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
811
// Float -> fixed-point conversions. Each scales by 2^fixed_point_position,
// adds a sign-dependent 0.5 bias so the truncating vcvtq rounds to nearest,
// then narrows with saturation.

/** Convert 8 floats (two float32x4 registers) to a QS8x8 fixed-point vector.
 *
 * @param[in] a                    Input values
 * @param[in] fixed_point_position Number of fractional bits of the result
 *
 * @return Rounded, saturated QS8 vector
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias: makes the subsequent truncating conversion
    // round to nearest for both positive and negative inputs.
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}

/** Convert 4 floats to a QS16x4 fixed-point vector (round to nearest,
 * saturated narrow).
 */
inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));

    res_f32 = vmlaq_f32(res_f32, a, pow2);

    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);

    return vqmovn_s32(res_s32);
}

/** Convert 16 floats (four float32x4 registers) to a QS8x16 fixed-point
 * vector (round to nearest, saturated narrow).
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}

/** Convert 8 floats (two float32x4 registers) to a QS16x8 fixed-point
 * vector (round to nearest, saturated narrow).
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Sign-dependent +/-0.5 bias for round-to-nearest on truncating convert
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
918
/** Convert 8 QS8 fixed point values to two float32x4 vectors.
 *
 * @param[in] a                    8 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Two float vectors (8 values total)
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16, then split into two s32 vectors
    const int16x8_t wide16 = vmovl_s8(a);
    const int32x4_t lo32   = vmovl_s16(vget_low_qs16(wide16));
    const int32x4_t hi32   = vmovl_s16(vget_high_qs16(wide16));

    // Convert to float and undo the fixed point scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo32), inv_scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi32), inv_scale);
    return out;
}
946
/** Convert 4 QS16 fixed point values to a float32x4 vector.
 *
 * @param[in] a                    4 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return The converted float vector
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    // Widen to s32, convert to float, then divide by 2^fixed_point_position
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), inv_scale);
}
954
/** Convert 16 QS8 fixed point values to four float32x4 vectors.
 *
 * @param[in] a                    16 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Four float vectors (16 values total)
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 halves, then each s16 half into two s32 vectors
    const int16x8_t lo16 = vmovl_s8(vget_low_s8(a));
    const int16x8_t hi16 = vmovl_s8(vget_high_s8(a));

    const int32x4_t wide32[4] =
    {
        vmovl_s16(vget_low_qs16(lo16)),
        vmovl_s16(vget_high_qs16(lo16)),
        vmovl_s16(vget_low_qs16(hi16)),
        vmovl_s16(vget_high_qs16(hi16)),
    };

    // Convert each lane group to float and undo the fixed point scaling
    float32x4x4_t out;
    for(int i = 0; i < 4; ++i)
    {
        out.val[i] = vmulq_f32(vcvtq_f32_s32(wide32[i]), inv_scale);
    }
    return out;
}
994
/** Convert 8 QS16 fixed point values to two float32x4 vectors.
 *
 * @param[in] a                    8 fixed point values to convert
 * @param[in] fixed_point_position Number of fractional bits of the input Q format
 *
 * @return Two float vectors (8 values total)
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t inv_scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half to s32, convert to float and undo the fixed point scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_qs16(a))), inv_scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_qs16(a))), inv_scale);
    return out;
}
1020
/** Compute the reciprocal 1/a of 8 QS8 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial estimate 48/17 - 32/17 * x (subtraction; was erroneously an addition, cf. vrecipq_qs8)
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshl_s8(x, shift_value);
}
1044
/** Compute the reciprocal 1/a of 4 QS16 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1).
    // 16-bit lanes, so normalise against 16 (was 8, copied from the QS8 variant; cf. vrecipq_qs16)
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial estimate 48/17 - 32/17 * x (subtraction; was erroneously an addition, cf. vrecipq_qs16)
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshl_s16(x, shift_value);
}
1070
/** Compute the reciprocal 1/a of 16 QS8 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vshlq_s8(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshlq_s8(x, shift_value);
}
1095
/** Compute the reciprocal 1/a of 8 QS16 fixed point values with Newton-Raphson.
 *
 * The input is normalised into [0.5, 1), seeded with the minimax initial
 * estimate 48/17 - 32/17 * x, refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)) and denormalised back.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 = 2.8235... -> 2.8235 * 2^13 = 23130 = 0x5A5A (was mistyped 0x5A56; cf. vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vshlq_s16(x, shift_value);
}
1122
/** Compute the reciprocal 1/a of 16 QS8 fixed point values with Newton-Raphson (saturating arithmetic).
 *
 * Same algorithm as vrecipq_qs8 but using the saturating variants of the
 * shift/add/sub/mul intrinsics where they are available.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp = vqshlq_s8(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vqshlq_s8(x, shift_value);
}
1147
/** Compute the reciprocal 1/a of 8 QS16 fixed point values with Newton-Raphson (saturating arithmetic).
 *
 * Same algorithm as vrecipq_qs16 but using the saturating variants of the
 * shift/add/sub/mul intrinsics.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of reciprocals in the same Q format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // 48/17 = 2.8235... -> 2.8235 * 2^13 = 23130 = 0x5A5A (was mistyped 0x5A56; cf. vrecip_qs16)
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value that normalises the input into [0.5, 1)
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial estimate: 48/17 - 32/17 * x
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // Denormalise the result
    return vqshlq_s16(x, shift_value);
}
1174
/** Division of two QS8 fixed point vectors: a / b computed as a * (1/b). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t recip_b = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, recip_b, fixed_point_position);
}
1179
/** Division of two QS16 fixed point vectors: a / b computed as a * (1/b). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t recip_b = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, recip_b, fixed_point_position);
}
1184
/** Division of two QS8 fixed point vectors (16 lanes): a / b computed as a * (1/b). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t recip_b = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, recip_b, fixed_point_position);
}
1189
/** Division of two QS16 fixed point vectors (8 lanes): a / b computed as a * (1/b). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t recip_b = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, recip_b, fixed_point_position);
}
1194
/** Evaluate the 4-coefficient polynomial used by the QS8 exp/log approximations (8 lanes).
 *
 * Coefficients come from log_tab_qs8 (islog) or exp_tab_qs8 (!islog); they are stored
 * in Q0.7 and rescaled here to the requested Q format, then evaluated with Horner's scheme.
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit     = vdup_n_s8(1);
    const qint8x8_t c_a         = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x8_t c_b         = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c_c         = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c_d         = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x8_t horner = vadd_s8(vmul_qs8(a, c_d, fixed_point_position), c_c);
    horner           = vadd_s8(vmul_qs8(a, horner, fixed_point_position), c_b);
    horner           = vadd_s8(vmul_qs8(a, horner, fixed_point_position), c_a);
    return vmul_qs8(a, horner, fixed_point_position);
}
1210
/** Evaluate the 4-coefficient polynomial used by the QS16 exp/log approximations (4 lanes).
 *
 * Coefficients come from log_tab_qs16 (islog) or exp_tab_qs16 (!islog); they are stored
 * in Q0.15 and rescaled here to the requested Q format, then evaluated with Horner's scheme.
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit     = vdup_n_s16(1);
    const qint16x4_t c_a         = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x4_t c_b         = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c_c         = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c_d         = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x4_t horner = vadd_s16(vmul_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vadd_s16(vmul_qs16(a, horner, fixed_point_position), c_b);
    horner            = vadd_s16(vmul_qs16(a, horner, fixed_point_position), c_a);
    return vmul_qs16(a, horner, fixed_point_position);
}
1226
/** Saturating variant of vtaylor_poly_qs8: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (8 QS8 lanes). */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one_bit     = vdup_n_s8(1);
    const qint8x8_t c_a         = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x8_t c_b         = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c_c         = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c_d         = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x8_t horner = vqadd_s8(vqmul_qs8(a, c_d, fixed_point_position), c_c);
    horner           = vqadd_s8(vqmul_qs8(a, horner, fixed_point_position), c_b);
    horner           = vqadd_s8(vqmul_qs8(a, horner, fixed_point_position), c_a);
    return vqmul_qs8(a, horner, fixed_point_position);
}
1242
/** Saturating variant of vtaylor_poly_qs16: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (4 QS16 lanes). */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one_bit     = vdup_n_s16(1);
    const qint16x4_t c_a         = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x4_t c_b         = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c_c         = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c_d         = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x4_t horner = vqadd_s16(vqmul_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vqadd_s16(vqmul_qs16(a, horner, fixed_point_position), c_b);
    horner            = vqadd_s16(vqmul_qs16(a, horner, fixed_point_position), c_a);
    return vqmul_qs16(a, horner, fixed_point_position);
}
1258
/** Evaluate the 4-coefficient polynomial used by the QS8 exp/log approximations (16 lanes).
 *
 * Coefficients come from log_tabq_qs8 (islog) or exp_tabq_qs8 (!islog), rescaled from
 * Q0.7 to the requested Q format and evaluated with Horner's scheme.
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit     = vdupq_n_s8(1);
    const qint8x16_t c_a         = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x16_t c_b         = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c_c         = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c_d         = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x16_t horner = vaddq_s8(vmulq_qs8(a, c_d, fixed_point_position), c_c);
    horner            = vaddq_s8(vmulq_qs8(a, horner, fixed_point_position), c_b);
    horner            = vaddq_s8(vmulq_qs8(a, horner, fixed_point_position), c_a);
    return vmulq_qs8(a, horner, fixed_point_position);
}
1274
/** Evaluate the 4-coefficient polynomial used by the QS16 exp/log approximations (8 lanes).
 *
 * Coefficients come from log_tabq_qs16 (islog) or exp_tabq_qs16 (!islog), rescaled from
 * Q0.15 to the requested Q format and evaluated with Horner's scheme.
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit     = vdupq_n_s16(1);
    const qint16x8_t c_a         = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x8_t c_b         = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c_c         = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c_d         = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x8_t horner = vaddq_s16(vmulq_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vaddq_s16(vmulq_qs16(a, horner, fixed_point_position), c_b);
    horner            = vaddq_s16(vmulq_qs16(a, horner, fixed_point_position), c_a);
    return vmulq_qs16(a, horner, fixed_point_position);
}
1290
/** Saturating variant of vtaylor_polyq_qs8: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (16 QS8 lanes). */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Rescale the Q0.7 table entries; the leading log coefficient gets one extra shift bit
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one_bit     = vdupq_n_s8(1);
    const qint8x16_t c_a         = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(coeff_shift, one_bit) : coeff_shift);
    const qint8x16_t c_b         = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c_c         = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c_d         = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint8x16_t horner = vqaddq_s8(vqmulq_qs8(a, c_d, fixed_point_position), c_c);
    horner            = vqaddq_s8(vqmulq_qs8(a, horner, fixed_point_position), c_b);
    horner            = vqaddq_s8(vqmulq_qs8(a, horner, fixed_point_position), c_a);
    return vqmulq_qs8(a, horner, fixed_point_position);
}
1306
/** Saturating variant of vtaylor_polyq_qs16: same coefficient rescaling and Horner
 * evaluation, using the saturating shift/add/mul intrinsics (8 QS16 lanes). */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Rescale the Q0.15 table entries; the leading log coefficient gets one extra shift bit
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one_bit     = vdupq_n_s16(1);
    const qint16x8_t c_a         = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(coeff_shift, one_bit) : coeff_shift);
    const qint16x8_t c_b         = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c_c         = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c_d         = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // Horner evaluation: (((D * a + C) * a + B) * a + A) * a
    qint16x8_t horner = vqaddq_s16(vqmulq_qs16(a, c_d, fixed_point_position), c_c);
    horner            = vqaddq_s16(vqmulq_qs16(a, horner, fixed_point_position), c_b);
    horner            = vqaddq_s16(vqmulq_qs16(a, horner, fixed_point_position), c_a);
    return vqmulq_qs16(a, horner, fixed_point_position);
}
1322
/** Compute exp(a) for 8 QS8 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    // Constants stored in Q0.7 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1348
/** Compute exp(a) for 4 QS16 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    // Constants stored in Q0.15 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint16x4_t const_ln2 = vqrshl_s16(vdup_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1374
/** Compute exp(a) for 16 QS8 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    // Constants stored in Q0.7 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1400
/** Compute exp(a) for 8 QS16 fixed point values (saturating).
 *
 * Range reduction: a = n*ln(2) + alpha with alpha in [-ln(2), ln(2)];
 * exp(alpha) is approximated by a polynomial and the result reconstructed
 * by shifting left/right by n (i.e. multiplying by 2^n).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of exponentials in the same Q format
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    // Constants stored in Q0.15 and rescaled to the requested format.
    // 1/ln(2) ~= 1.4427 does not fit in Q0.x, so its integer bit is OR-ed in separately.
    const qint16x8_t const_ln2 = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value); // ln(2)
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part n of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = a - n*ln(2), the residual inside the reduced range
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation of exp(alpha); add 1 for the constant term
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^n via a (possibly negative) saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1426
/** Compute the natural logarithm of 8 QS8 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip = vdup_n_s8(0);
    // Select a (not the zero vector) where a < 1 so vrecip_qs8 computes 1/a.
    // The operands were previously swapped, feeding zero into vrecip_qs8 (cf. vlogq_qs8).
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a = vbsl_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum = vmul_s8(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1465
/** Compute the natural logarithm of 4 QS16 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2 = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip = vdup_n_s16(0);
    // Select a (not the zero vector) where a < 1 so vrecip_qs16 computes 1/a.
    // The operands were previously swapped, feeding zero into vrecip_qs16 (cf. vlogq_qs16).
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a = vbsl_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum = vmul_s16(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1504
/** Compute the natural logarithm of 16 QS8 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip = vdupq_n_s8(0);
    // Select a where a < 1 so the reciprocal below is 1/a (other lanes don't matter)
    recip = vbslq_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a = vbslq_s8(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1543
/** Compute the natural logarithm of 8 QS16 fixed point values.
 *
 * Inputs in (0, 1) are handled via log(x) = -log(1/x). The (possibly inverted)
 * input is decomposed as x = 2^n * m with m in (1, 2], log(m) is approximated
 * by a polynomial, and the result is reconstructed as (poly + n) * ln(2).
 * Note: this variant mixes saturating (vq*) and non-saturating intrinsics;
 * presumably intentional, but worth confirming against the QS8 variant.
 *
 * @param[in] a                    Vector of fixed point inputs
 * @param[in] fixed_point_position Number of fractional bits
 *
 * @return Vector of logarithms in the same Q format
 */
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    // Select a where a < 1 so the reciprocal below is 1/a (other lanes don't matter)
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2], and n * 1.0 in the current Q format
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one);

    // Polynomial Approximation of log(m)
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: (poly + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1582
/** Calculate the inverse square root of a fixed point 8 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s8(a, shift_value);

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshl_s8(x, shift_value2);
}
1609
/** Calculate the inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp         = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
    temp                    = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshl_s16(x, shift_value2);
}
1638
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001639inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
1640{
1641 const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
1642
1643 // Find shift value. Number must be in (0.5, 2) range.
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001644 qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001645
1646 // Add one when the shift value is negative in order to get the correct result when we shift right with 1
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001647 qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001648 uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001649 temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001650 qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
1651
1652 temp = vshl_s8(a, shift_value);
1653
1654 // Initial guess
1655 qint8x8_t x = temp;
1656
1657 // Calculate (x / 2) * (3 - a * x^2)
1658 // After three iterations we have the result for 8 bit
1659 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1660 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1661 x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
1662
1663 return vshl_s8(x, shift_value2);
1664}
1665
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (4 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x4_t shift_value = vneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp         = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
    temp                    = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshl_s16(a, shift_value);

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshl_s16(x, shift_value2);
}
1694
/** Calculate the inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshlq_s8(x, shift_value2);
}
1721
/** Calculate the inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2, then undoes half of the
 * scaling shift on the result (since sqrt halves exponents).
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp         = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                    = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range
    temp = vshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift
    return vshlq_s16(x, shift_value2);
}
1750
/** Calculate the saturating inverse square root of a fixed point 8 bit vector (16 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs three
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS8 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format.
 *
 * @return 1 / sqrt(a) in QS8 format.
 */
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(8 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp         = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
    temp                    = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));

    // Bring a into the (0.5, 2) range
    // NOTE(review): this intermediate shift is non-saturating, unlike the qs16
    // variant which uses vqshlq_s16 here — confirm the asymmetry is intended.
    temp = vshlq_s8(a, shift_value);

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshlq_s8(x, shift_value2);
}
1777
/** Calculate the saturating inverse square root of a fixed point 16 bit vector (8 elements).
 *
 * Scales the input by a power of two into the (0.5, 2) range, runs five
 * Newton-Raphson iterations x = x * (3 - a * x^2) / 2 with saturating
 * arithmetic, then undoes half of the scaling shift on the result.
 *
 * @param[in] a                    Input vector in QS16 format (expected > 0).
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format.
 *
 * @return 1 / sqrt(a) in QS16 format.
 */
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in the current Q format

    // Find shift value. Number must be in (0.5, 2) range.
    // shift_value = -(16 - (clz(a) + fixed_point_position)), i.e. minus the exponent of a
    qint16x8_t shift_value = vnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp         = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
    temp                    = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    // shift_value2 = -(temp / 2): half of the input scaling, used to rescale the result
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));

    // Bring a into the (0.5, 2) range (saturating)
    temp = vqshlq_s16(a, shift_value);

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    // Undo half of the scaling shift (saturating)
    return vqshlq_s16(x, shift_value2);
}
1806
1807inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
1808{
1809 const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
1810 const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
1811
1812 qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
1813 qint8x8_t num = vqsub_qs8(exp2x, const_one);
1814 qint8x8_t den = vqadd_qs8(exp2x, const_one);
1815 qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
1816
1817 return tanh;
1818}
1819
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001820inline qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position)
1821{
1822 const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
1823 const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
1824
1825 qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
1826 qint16x4_t num = vqsub_qs16(exp2x, const_one);
1827 qint16x4_t den = vqadd_qs16(exp2x, const_one);
1828 qint16x4_t tanh = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position);
1829
1830 return tanh;
1831}
1832
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001833inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
1834{
1835 const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
1836 const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
1837
1838 qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
1839 qint8x16_t num = vqsubq_qs8(exp2x, const_one);
1840 qint8x16_t den = vqaddq_qs8(exp2x, const_one);
1841 qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
1842
1843 return tanh;
1844}
1845
1846inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
1847{
1848 return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
1849}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001850
1851inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
1852{
1853 float32x4x2_t res =
1854 {
1855 {
1856 vmaxq_f32(a.val[0], b.val[0]),
1857 vmaxq_f32(a.val[1], b.val[1])
1858 }
1859 };
1860 return res;
1861}
Michalis Spyrou0a8334c2017-06-14 18:00:05 +01001862
1863inline qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position)
1864{
1865 const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
1866 const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
1867
1868 qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
1869 qint16x8_t num = vqsubq_qs16(exp2x, const_one);
1870 qint16x8_t den = vqaddq_qs16(exp2x, const_one);
1871 qint16x8_t tanh = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
1872
1873 return tanh;
1874}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001875}