/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25namespace arm_compute
26{
/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x8_t, 4> exp_tab_qs8 =
{
    {
        vdup_n_s8(0x7F), // 0.9978546
        vdup_n_s8(0x3F), // 0.4994721
        vdup_n_s8(0x16), // 0.1763723
        vdup_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x4_t, 4> exp_tab_qs16 =
{
    {
        vdup_n_s16(0x7FBA), // 0.9978546
        vdup_n_s16(0x3FE9), // 0.4994721
        vdup_n_s16(0x1693), // 0.1763723
        vdup_n_s16(0x0592), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements */
static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
{
    {
        vdupq_n_s8(0x7F), // 0.9978546
        vdupq_n_s8(0x3F), // 0.4994721
        vdupq_n_s8(0x16), // 0.1763723
        vdupq_n_s8(0x05), // 0.0435108
    }
};

/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements */
static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
{
    {
        vdupq_n_s16(0x7FBA), // 0.9978546
        vdupq_n_s16(0x3FE9), // 0.4994721
        vdupq_n_s16(0x1693), // 0.1763723
        vdupq_n_s16(0x0592), // 0.0435108
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x8_t, 4> log_tab_qs8 =
{
    {
        vdup_n_s8(0x5C), // 1.4384189
        vdup_n_s8(-0x56), // -0.6771900
        vdup_n_s8(0x29), // 0.3218538
        vdup_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x4_t, 4> log_tab_qs16 =
{
    {
        vdup_n_s16(0x5C0F), // 1.4384189
        vdup_n_s16(-0x56AE), // -0.6771900
        vdup_n_s16(0x2933), // 0.3218538
        vdup_n_s16(-0x0AA7), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
 * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
static const std::array<qint8x16_t, 4> log_tabq_qs8 =
{
    {
        vdupq_n_s8(0x5C), // 1.4384189
        vdupq_n_s8(-0x56), // -0.6771900
        vdupq_n_s8(0x29), // 0.3218538
        vdupq_n_s8(-0x0A), // -0.0832229
    }
};

/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
 * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
static const std::array<qint16x8_t, 4> log_tabq_qs16 =
{
    {
        vdupq_n_s16(0x5C0F), // 1.4384189
        vdupq_n_s16(-0x56AE), // -0.6771900
        vdupq_n_s16(0x2933), // 0.3218538
        vdupq_n_s16(-0x0AA7), // -0.0832229
    }
};
122
/** Extract the lower half (8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_low_qs8(qint8x16_t a)
{
    return vget_low_s8(a);
}

/** Extract the lower half (4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_low_qs16(qint16x8_t a)
{
    return vget_low_s16(a);
}

/** Extract the upper half (8 lanes) of a 16-lane 8 bit fixed point vector. */
inline qint8x8_t vget_high_qs8(qint8x16_t a)
{
    return vget_high_s8(a);
}

/** Extract the upper half (4 lanes) of an 8-lane 16 bit fixed point vector. */
inline qint16x4_t vget_high_qs16(qint16x8_t a)
{
    return vget_high_s16(a);
}
142
/** Load eight 8 bit fixed point values from memory. */
inline qint8x8_t vld1_qs8(const qint8_t *addr)
{
    return vld1_s8(addr);
}

/** Load four 16 bit fixed point values from memory. */
inline qint16x4_t vld1_qs16(const qint16_t *addr)
{
    return vld1_s16(addr);
}

/** Load sixteen 8 bit fixed point values from memory. */
inline qint8x16_t vld1q_qs8(const qint8_t *addr)
{
    return vld1q_s8(addr);
}

/** Load eight 16 bit fixed point values from memory. */
inline qint16x8_t vld1q_qs16(const qint16_t *addr)
{
    return vld1q_s16(addr);
}

/** Load one 8 bit fixed point value and duplicate it across all 8 lanes. */
inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
{
    return vld1_dup_s8(addr);
}

/** Load one 16 bit fixed point value and duplicate it across all 4 lanes. */
inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
{
    return vld1_dup_s16(addr);
}

/** Load one 8 bit fixed point value and duplicate it across all 16 lanes. */
inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
{
    return vld1q_dup_s8(addr);
}

/** Load one 16 bit fixed point value and duplicate it across all 8 lanes. */
inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
{
    return vld1q_dup_s16(addr);
}
182
/** Store eight 8 bit fixed point values to memory. */
inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
{
    vst1_s8(addr, b);
}

/** Store four 16 bit fixed point values to memory. */
inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
{
    vst1_s16(addr, b);
}

/** Store sixteen 8 bit fixed point values to memory. */
inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
{
    vst1q_s8(addr, b);
}

/** Store eight 16 bit fixed point values to memory. */
inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
{
    vst1q_s16(addr, b);
}

/** Store two interleaved vectors of eight 16 bit fixed point values to memory. */
inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
{
    vst2q_s16(addr, b);
}

/** Saturating narrow: eight 16 bit lanes to eight 8 bit fixed point lanes. */
inline qint8x8_t vqmovn_qs16(qint16x8_t a)
{
    return vqmovn_s16(a);
}

/** Saturating narrow: four 32 bit lanes to four 16 bit fixed point lanes. */
inline qint16x4_t vqmovn_qs32(qint32x4_t a)
{
    return vqmovn_s32(a);
}
217
/** Duplicate an 8 bit fixed point value across all 8 lanes. */
inline qint8x8_t vdup_n_qs8(qint8_t a)
{
    return vdup_n_s8(a);
}

/** Duplicate a 16 bit fixed point value across all 4 lanes. */
inline qint16x4_t vdup_n_qs16(qint16_t a)
{
    return vdup_n_s16(a);
}

/** Duplicate an 8 bit fixed point value across all 16 lanes. */
inline qint8x16_t vdupq_n_qs8(qint8_t a)
{
    return vdupq_n_s8(a);
}
232
/** Broadcast a float into all sixteen lanes of an 8 bit fixed point vector.
 *
 * @param a                    Float value to broadcast
 * @param fixed_point_position Number of fractional bits of the result
 * @return Vector with every lane set to the fixed point representation of @p a
 */
inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
{
    // Replicate the scalar into four float vectors, then quantize all of them at once
    float32x4x4_t res =
    {
        {
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
            vdupq_n_f32(a),
        }
    };
    return vqcvtq_qs8_f32(res, fixed_point_position);
}
246
/** Duplicate a 16 bit fixed point value across all 8 lanes. */
inline qint16x8_t vdupq_n_qs16(qint16_t a)
{
    return vdupq_n_s16(a);
}

/** Duplicate a 32 bit fixed point value across all 4 lanes. */
inline qint32x4_t vdupq_n_qs32(qint32_t a)
{
    return vdupq_n_s32(a);
}
256
/** Absolute value of eight 8 bit fixed point lanes (non-saturating). */
inline qint8x8_t vabs_qs8(qint8x8_t a)
{
    return vabs_s8(a);
}

/** Absolute value of four 16 bit fixed point lanes (non-saturating). */
inline qint16x4_t vabs_qs16(qint16x4_t a)
{
    return vabs_s16(a);
}

/** Absolute value of sixteen 8 bit fixed point lanes (non-saturating). */
inline qint8x16_t vabsq_qs8(qint8x16_t a)
{
    return vabsq_s8(a);
}

/** Absolute value of eight 16 bit fixed point lanes (non-saturating). */
inline qint16x8_t vabsq_qs16(qint16x8_t a)
{
    return vabsq_s16(a);
}

/** Saturating absolute value of eight 8 bit fixed point lanes. */
inline qint8x8_t vqabs_qs8(qint8x8_t a)
{
    return vqabs_s8(a);
}

/** Saturating absolute value of four 16 bit fixed point lanes. */
inline qint16x4_t vqabs_qs16(qint16x4_t a)
{
    return vqabs_s16(a);
}

/** Saturating absolute value of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqabsq_qs8(qint8x16_t a)
{
    return vqabsq_s8(a);
}

/** Saturating absolute value of eight 16 bit fixed point lanes. */
inline qint16x8_t vqabsq_qs16(qint16x8_t a)
{
    return vqabsq_s16(a);
}
296
/** Lane-wise maximum of eight 8 bit fixed point lanes. */
inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmax_s8(a, b);
}

/** Lane-wise maximum of four 16 bit fixed point lanes. */
inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmax_s16(a, b);
}

/** Lane-wise maximum of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vmaxq_s8(a, b);
}

/** Pairwise maximum of adjacent 8 bit fixed point lanes of @p a and @p b. */
inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmax_s8(a, b);
}

/** Pairwise maximum of adjacent 16 bit fixed point lanes of @p a and @p b. */
inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmax_s16(a, b);
}

/** Lane-wise maximum of eight 16 bit fixed point lanes. */
inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vmaxq_s16(a, b);
}

/** Lane-wise minimum of eight 8 bit fixed point lanes. */
inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vmin_s8(a, b);
}

/** Lane-wise minimum of four 16 bit fixed point lanes. */
inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vmin_s16(a, b);
}

/** Lane-wise minimum of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vminq_s8(a, b);
}

/** Pairwise minimum of adjacent 8 bit fixed point lanes of @p a and @p b. */
inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
{
    return vpmin_s8(a, b);
}

/** Pairwise minimum of adjacent 16 bit fixed point lanes of @p a and @p b. */
inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
{
    return vpmin_s16(a, b);
}

/** Lane-wise minimum of eight 16 bit fixed point lanes. */
inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vminq_s16(a, b);
}
356
/** Wrapping addition of eight 8 bit fixed point lanes. */
inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vadd_s8(a, b);
}

/** Wrapping addition of four 16 bit fixed point lanes. */
inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vadd_s16(a, b);
}

/** Wrapping addition of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vaddq_s8(a, b);
}

/** Wrapping addition of eight 16 bit fixed point lanes. */
inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vaddq_s16(a, b);
}

/** Saturating addition of eight 8 bit fixed point lanes. */
inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqadd_s8(a, b);
}

/** Saturating addition of four 16 bit fixed point lanes. */
inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqadd_s16(a, b);
}

/** Saturating addition of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqaddq_s8(a, b);
}

/** Saturating addition of eight 16 bit fixed point lanes. */
inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqaddq_s16(a, b);
}

/** Pairwise widening addition: adjacent 8 bit lanes summed into 16 bit lanes. */
inline int16x4_t vpaddl_qs8(qint8x8_t a)
{
    return vpaddl_s8(a);
}
401
/** Wrapping subtraction of eight 8 bit fixed point lanes. */
inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vsub_s8(a, b);
}

/** Wrapping subtraction of four 16 bit fixed point lanes. */
inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vsub_s16(a, b);
}

/** Wrapping subtraction of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vsubq_s8(a, b);
}

/** Wrapping subtraction of eight 16 bit fixed point lanes. */
inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vsubq_s16(a, b);
}

/** Saturating subtraction of eight 8 bit fixed point lanes. */
inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
{
    return vqsub_s8(a, b);
}

/** Saturating subtraction of four 16 bit fixed point lanes. */
inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
{
    return vqsub_s16(a, b);
}

/** Saturating subtraction of sixteen 8 bit fixed point lanes. */
inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
{
    return vqsubq_s8(a, b);
}

/** Saturating subtraction of eight 16 bit fixed point lanes. */
inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
{
    return vqsubq_s16(a, b);
}
441
/** Fixed point multiplication of eight 8 bit lanes, rounding to nearest.
 *
 * The product is computed in 16 bits, a rounding constant of 2^(fpp-1) is
 * added, then the result is shifted right by @p fixed_point_position and
 * narrowed back to 8 bits without saturation.
 */
inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    // Negative shift amount: vshlq with a negative count performs a right shift
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8
    return vmovn_s16(res);
}

/** Fixed point multiplication of four 16 bit lanes, rounding to nearest.
 *
 * Same scheme as vmul_qs8 with a 32 bit intermediate; the narrowing back to
 * 16 bits is non-saturating.
 */
inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16
    return vmovn_s32(res);
}
475
/** Fixed point multiplication of sixteen 8 bit lanes, rounding to nearest.
 *
 * The low and high halves are widened to 16 bits and processed separately,
 * then recombined; the final narrowing is non-saturating.
 */
inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s16(res0, fixed_point_position_s16);
    res1 = vshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8
    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
}

/** Fixed point multiplication of eight 16 bit lanes, rounding to nearest.
 *
 * The low and high halves are widened to 32 bits and processed separately,
 * then recombined; the final narrowing is non-saturating.
 */
inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vshlq_s32(res0, fixed_point_position_s32);
    res1 = vshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16
    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
}
515
/** Saturating fixed point multiplication of eight 8 bit lanes, rounding to nearest.
 *
 * Like vmul_qs8 but both the right shift (vqshlq) and the final narrowing
 * (vqmovn) saturate.
 */
inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s8(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s16(res, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vqmovn_s16(res);
}

/** Saturating fixed point multiplication of four 16 bit lanes, rounding to nearest. */
inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary result with a constant used to round up the result
    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    res = vmlal_s16(res, a, b);

    // Shift right by fixed_point_position
    res = vqshlq_s32(res, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vqmovn_s32(res);
}
549
/** Saturating fixed point multiplication of sixteen 8 bit lanes, rounding to nearest.
 *
 * Low/high halves are widened and processed separately; shift and narrowing
 * both saturate.
 */
inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s16(res0, fixed_point_position_s16);
    res1 = vqshlq_s16(res1, fixed_point_position_s16);

    // Convert back to qint8 and saturate
    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
}

/** Saturating fixed point multiplication of eight 16 bit lanes, rounding to nearest. */
inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t res1 = res0;

    // Vector multiply-accumulate long
    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));

    // Shift right by fixed_point_position
    res0 = vqshlq_s32(res0, fixed_point_position_s32);
    res1 = vqshlq_s32(res1, fixed_point_position_s32);

    // Convert back to qint16 and saturate
    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
}
589
/** Widening fixed point multiplication: eight 8 bit lanes produce eight 16 bit lanes.
 *
 * The right shift by @p fixed_point_position uses vqrshlq (saturating,
 * rounding shift), so no explicit rounding constant is needed.
 */
inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    qint16x8_t res = vmull_s8(a, b);

    return vqrshlq_s16(res, fixed_point_position_s16);
}
598
/** Fixed point multiply-accumulate: a + b * c for eight 8 bit lanes.
 *
 * The product is rounded to nearest; the final addition and narrowing do not
 * saturate (see vqmla_qs8 for the saturating variant).
 */
inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vadd_s8(a, vmovn_s16(tmp));
}

/** Fixed point multiply-accumulate: a + b * c for four 16 bit lanes (non-saturating). */
inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vadd_s16(a, vmovn_s32(tmp));
}
632
/** Fixed point multiply-accumulate: a + b * c for sixteen 8 bit lanes (non-saturating). */
inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
}

/** Fixed point multiply-accumulate: a + b * c for eight 16 bit lanes (non-saturating). */
inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)))
;
}
672
/** Saturating fixed point multiply-accumulate: a + b * c for eight 8 bit lanes. */
inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    return vqadd_s8(a, vqmovn_s16(tmp));
}

/** Saturating fixed point multiply-accumulate: a + b * c for four 16 bit lanes. */
inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    return vqadd_s16(a, vqmovn_s32(tmp));
}
706
/** Saturating fixed point multiply-accumulate: a + b * c for sixteen 8 bit lanes. */
inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
    qint16x8_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);

    // Convert back to qint8 and accumulate
    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
    return vqaddq_s8(a, res);
}

/** Saturating fixed point multiply-accumulate: a + b * c for eight 16 bit lanes. */
inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
    qint32x4_t tmp1 = tmp0;

    // Vector multiply-accumulate long
    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));

    // Shift right by fixed_point_position
    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);

    // Convert back to qint16 and accumulate
    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
    return vqaddq_s16(a, res);
}
748
/** Widening fixed point multiply-accumulate: 16 bit a + (8 bit b * 8 bit c).
 *
 * The product stays in the 16 bit accumulator format; no narrowing occurs.
 */
inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vaddq_s16(a, tmp);
}

/** Widening fixed point multiply-accumulate: 32 bit a + (16 bit b * 16 bit c). */
inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vaddq_s32(a, tmp);
}
782
/** Saturating widening multiply-accumulate: 16 bit a + (8 bit b * 8 bit c). */
inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
{
    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s8(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s16(tmp, fixed_point_position_s16);

    // Accumulate
    return vqaddq_s16(a, tmp);
}

/** Saturating widening multiply-accumulate: 32 bit a + (16 bit b * 16 bit c). */
inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
{
    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);

    // Initialize the temporary results with a constant used to round up the result
    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));

    // Vector multiply-accumulate long
    tmp = vmlal_s16(tmp, b, c);

    // Shift right by fixed_point_position
    tmp = vqshlq_s32(tmp, fixed_point_position_s32);

    // Accumulate
    return vqaddq_s32(a, tmp);
}
816
/** Convert eight floats (two float32x4) to eight 8 bit fixed point values.
 *
 * Rounds half away from zero (adds +0.5/-0.5 depending on sign before
 * truncating conversion) and saturates on narrowing.
 *
 * @param a                    Two float vectors holding the eight input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
        }
    };

    // Narrow 32 -> 16 -> 8 bits with saturation at each step
    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));

    return vqmovn_s16(res_s16);
}
844
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100845inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100846{
847 const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
848
Georgios Pinitas21efeb42017-07-04 12:47:17 +0100849 float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
Michalis Spyrou0a8334c2017-06-14 18:00:05 +0100850
851 res_f32 = vmlaq_f32(res_f32, a, pow2);
852
853 const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
854
855 return vqmovn_s32(res_s32);
856}
857
/** Convert sixteen floats (four float32x4) to sixteen 8 bit fixed point values.
 *
 * Rounds half away from zero and saturates at each narrowing step
 * (32 -> 16 -> 8 bits).
 *
 * @param a                    Four float vectors holding the sixteen input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x4_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);

    const int32x4x4_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1]),
            vcvtq_s32_f32(res_f32.val[2]),
            vcvtq_s32_f32(res_f32.val[3]),
        }
    };

    // Narrow 32 -> 16 bits with saturation
    const int16x8x2_t res_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
        }
    };

    // Narrow 16 -> 8 bits with saturation
    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
897
/** Convert eight floats (two float32x4) to eight 16 bit fixed point values.
 *
 * Rounds half away from zero and saturates when narrowing to 16 bits.
 *
 * @param a                    Two float vectors holding the eight input values
 * @param fixed_point_position Number of fractional bits of the result
 */
inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));

    // Per-lane rounding offset: +0.5 for non-negative lanes, -0.5 otherwise
    float32x4x2_t res_f32 =
    {
        {
            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
        }
    };

    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);

    const int32x4x2_t res_s32 =
    {
        {
            vcvtq_s32_f32(res_f32.val[0]),
            vcvtq_s32_f32(res_f32.val[1])
        }
    };

    // Narrow 32 -> 16 bits with saturation
    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
}
923
/** Convert an 8-lane QS8 fixed-point vector to two float32x4 vectors.
 *
 * Lanes are sign-extended s8 -> s16 -> s32, converted to float and scaled
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    8-lane QS8 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Two float32x4 vectors holding the 8 converted values
 */
inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16, then each half s16 -> s32
    const int16x8_t widened = vmovl_s8(a);
    const int32x4_t lo      = vmovl_s16(vget_low_qs16(widened));
    const int32x4_t hi      = vmovl_s16(vget_high_qs16(widened));

    // Convert to float and apply the 2^-fixed_point_position scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi), scale);
    return out;
}
951
/** Convert a 4-lane QS16 fixed-point vector to one float32x4 vector.
 *
 * Lanes are sign-extended s16 -> s32, converted to float and scaled by
 * 2^-fixed_point_position.
 *
 * @param[in] a                    4-lane QS16 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return The 4 converted values as a float32x4 vector
 */
inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));
    return vmulq_f32(vcvtq_f32_s32(vmovl_s16(a)), scale);
}
959
/** Convert a 16-lane QS8 fixed-point vector to four float32x4 vectors.
 *
 * Lanes are sign-extended s8 -> s16 -> s32, converted to float and scaled
 * by 2^-fixed_point_position.
 *
 * @param[in] a                    16-lane QS8 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Four float32x4 vectors holding the 16 converted values
 */
inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
{
    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen s8 -> s16 (two halves)
    const int16x8x2_t res_s16 =
    {
        {
            vmovl_s8(vget_low_s8(a)),
            vmovl_s8(vget_high_s8(a)),
        }
    };

    // Widen s16 -> s32 (four quarters)
    const int32x4x4_t res_s32 =
    {
        {
            vmovl_s16(vget_low_qs16(res_s16.val[0])),
            vmovl_s16(vget_high_qs16(res_s16.val[0])),
            vmovl_s16(vget_low_qs16(res_s16.val[1])),
            vmovl_s16(vget_high_qs16(res_s16.val[1])),
        }
    };

    float32x4x4_t res_f32 =
    {
        {
            vcvtq_f32_s32(res_s32.val[0]),
            vcvtq_f32_s32(res_s32.val[1]),
            vcvtq_f32_s32(res_s32.val[2]),
            vcvtq_f32_s32(res_s32.val[3])
        }
    };

    // Apply the 2^-fixed_point_position scaling
    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);

    return res_f32;
}
999
/** Convert an 8-lane QS16 fixed-point vector to two float32x4 vectors.
 *
 * Lanes are sign-extended s16 -> s32, converted to float and scaled by
 * 2^-fixed_point_position.
 *
 * @param[in] a                    8-lane QS16 vector to convert
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Two float32x4 vectors holding the 8 converted values
 */
inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
{
    const float32x4_t scale = vdupq_n_f32(1.0f / (1 << fixed_point_position));

    // Widen each half s16 -> s32
    const int32x4_t lo = vmovl_s16(vget_low_qs16(a));
    const int32x4_t hi = vmovl_s16(vget_high_qs16(a));

    // Convert to float and apply the 2^-fixed_point_position scaling
    float32x4x2_t out;
    out.val[0] = vmulq_f32(vcvtq_f32_s32(lo), scale);
    out.val[1] = vmulq_f32(vcvtq_f32_s32(hi), scale);
    return out;
}
1025
/** Compute the reciprocal 1/a of an 8-lane QS8 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Approximation of 1/a in the same QS8 format
 */
inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
    const qint8x8_t temp        = vshl_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a. This previously used vadd_s8,
    // which inverts the sign of the second term; vrecipq_qs8 performs the
    // subtraction, which is the mathematically correct form.
    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if it overshoots
    uint8x8_t set_one = vcgt_s8(x, const_one);
    x                 = vbsl_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshl_s8(x, shift_value);
}
1049
/** Compute the reciprocal 1/a of a 4-lane QS16 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Approximation of 1/a in the same QS16 format
 */
inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);

    // Find shift value. The bit width is 16 here (was 8, copied from the QS8
    // variant); vclz_s16 counts leading zeros out of 16 bits, and
    // vrecipq_qs16 uses 16 for the same computation.
    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
    const qint16x4_t temp        = vshl_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a. This previously used vadd_s16,
    // which inverts the sign of the second term; vrecipq_qs16 performs the
    // subtraction, which is the mathematically correct form.
    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));

    // Clamp the initial guess to one if it overshoots
    uint16x4_t set_one = vcgt_s16(x, const_one);
    x                  = vbsl_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshl_s16(x, shift_value);
}
1075
/** Compute the reciprocal 1/a of a 16-lane QS8 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with three Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Approximation of 1/a in the same QS8 format
 */
inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive constant, subtracted below)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshlq_s8(x, shift_value);
}
1100
/** Compute the reciprocal 1/a of an 8-lane QS16 vector (wrapping arithmetic).
 *
 * The input is normalised into [0.5, 1) by a shift, an initial guess
 * x0 = 48/17 - 32/17 * a is refined with five Newton-Raphson iterations
 * (x <- x + x * (1 - a * x)), and the result is de-normalised with the same
 * shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Approximation of 1/a in the same QS16 format
 */
inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    // NOTE(review): 0x5A56 differs from the 0x5A5A used in vrecip_qs16; both
    // are ~2.823 in Q2.13 — presumably a transcription difference, verify.
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value
    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vshlq_s16(x, shift_value);
}
1127
/** Compute the reciprocal 1/a of a 16-lane QS8 vector using saturating arithmetic.
 *
 * Same algorithm as vrecipq_qs8 (normalise to [0.5, 1), initial guess
 * x0 = 48/17 - 32/17 * a, three Newton-Raphson iterations, de-normalise)
 * but using the saturating add/sub/mul/shift variants.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return Saturated approximation of 1/a in the same QS8 format
 */
inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (positive constant, subtracted below)
    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);

    // Find shift value
    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
    const qint8x16_t temp        = vqshlq_s8(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    // NOTE(review): the multiply here is the non-saturating vmulq_qs8 while
    // the iterations below use vqmulq_qs8 — confirm this is intentional.
    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint8x16_t set_one = vcgtq_s8(x, const_one);
    x                  = vbslq_s8(set_one, const_one, x);

    // Use three iterations of Newton-Raphson method to get the result
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vqshlq_s8(x, shift_value);
}
1152
/** Compute the reciprocal 1/a of an 8-lane QS16 vector using saturating arithmetic.
 *
 * Same algorithm as vrecipq_qs16 (normalise to [0.5, 1), initial guess
 * x0 = 48/17 - 32/17 * a, five Newton-Raphson iterations, de-normalise)
 * but using the saturating add/sub/mul/shift variants.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return Saturated approximation of 1/a in the same QS16 format
 */
inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
{
    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 (positive constant, subtracted below)
    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);

    // Find shift value
    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
    const qint16x8_t temp        = vqshlq_s16(a, shift_value);

    // Initial guess x0 = 48/17 - 32/17 * a
    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));

    // Set initial guess to one if x > 1
    uint16x8_t set_one = vcgtq_s16(x, const_one);
    x                  = vbslq_s16(set_one, const_one, x);

    // Use five iterations of Newton-Raphson method to get the result
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));

    // De-normalise the result
    return vqshlq_s16(x, shift_value);
}
1179
/** Division of two 8-lane QS8 vectors: a / b computed as a * (1 / b). */
inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
{
    const qint8x8_t b_inv = vrecip_qs8(b, fixed_point_position);
    return vmul_qs8(a, b_inv, fixed_point_position);
}
1184
/** Division of two 4-lane QS16 vectors: a / b computed as a * (1 / b). */
inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
{
    const qint16x4_t b_inv = vrecip_qs16(b, fixed_point_position);
    return vmul_qs16(a, b_inv, fixed_point_position);
}
1189
/** Division of two 16-lane QS8 vectors: a / b computed as a * (1 / b). */
inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    const qint8x16_t b_inv = vrecipq_qs8(b, fixed_point_position);
    return vmulq_qs8(a, b_inv, fixed_point_position);
}
1194
/** Division of two 8-lane QS16 vectors: a / b computed as a * (1 / b). */
inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
{
    const qint16x8_t b_inv = vrecipq_qs16(b, fixed_point_position);
    return vmulq_qs16(a, b_inv, fixed_point_position);
}
1199
/** Evaluate a degree-4 polynomial on an 8-lane QS8 vector (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.7 to the requested format with a
 * rounding shift (the leading log coefficient gets one extra left shift).
 */
template <bool islog>
inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one         = vdup_n_s8(1);
    const qint8x8_t c0          = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(coeff_shift, one) : coeff_shift);
    const qint8x8_t c1          = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c2          = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c3          = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint8x8_t acc = vadd_s8(vmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vadd_s8(vmul_qs8(a, acc, fixed_point_position), c0);
    return vmul_qs8(a, acc, fixed_point_position);
}
1215
/** Evaluate a degree-4 polynomial on a 4-lane QS16 vector (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.15 to the requested format with a
 * rounding shift (the leading log coefficient gets one extra left shift).
 */
template <bool islog>
inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one         = vdup_n_s16(1);
    const qint16x4_t c0          = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(coeff_shift, one) : coeff_shift);
    const qint16x4_t c1          = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c2          = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c3          = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint16x4_t acc = vadd_s16(vmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vadd_s16(vmul_qs16(a, acc, fixed_point_position), c0);
    return vmul_qs16(a, acc, fixed_point_position);
}
1231
/** Evaluate a degree-4 polynomial on an 8-lane QS8 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.7 to the requested format with a
 * saturating rounding shift (the leading log coefficient gets one extra left
 * shift).
 */
template <bool islog>
inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t coeff_shift = vdup_n_s8(-(7 - fixed_point_position));
    const qint8x8_t one         = vdup_n_s8(1);
    const qint8x8_t c0          = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(coeff_shift, one) : coeff_shift);
    const qint8x8_t c1          = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], coeff_shift);
    const qint8x8_t c2          = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], coeff_shift);
    const qint8x8_t c3          = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint8x8_t acc = vqadd_s8(vqmul_qs8(a, c3, fixed_point_position), c2);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c1);
    acc           = vqadd_s8(vqmul_qs8(a, acc, fixed_point_position), c0);
    return vqmul_qs8(a, acc, fixed_point_position);
}
1247
/** Evaluate a degree-4 polynomial on a 4-lane QS16 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the log table when islog is true, otherwise from the
 * exp table; they are rescaled from Q0.15 to the requested format with a
 * saturating rounding shift (the leading log coefficient gets one extra left
 * shift).
 */
template <bool islog>
inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t coeff_shift = vdup_n_s16(-(15 - fixed_point_position));
    const qint16x4_t one         = vdup_n_s16(1);
    const qint16x4_t c0          = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(coeff_shift, one) : coeff_shift);
    const qint16x4_t c1          = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], coeff_shift);
    const qint16x4_t c2          = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], coeff_shift);
    const qint16x4_t c3          = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint16x4_t acc = vqadd_s16(vqmul_qs16(a, c3, fixed_point_position), c2);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c1);
    acc            = vqadd_s16(vqmul_qs16(a, acc, fixed_point_position), c0);
    return vqmul_qs16(a, acc, fixed_point_position);
}
1263
/** Evaluate a degree-4 polynomial on a 16-lane QS8 vector (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.7 to the requested
 * format with a rounding shift (the leading log coefficient gets one extra
 * left shift).
 */
template <bool islog>
inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one         = vdupq_n_s8(1);
    const qint8x16_t c0          = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(coeff_shift, one) : coeff_shift);
    const qint8x16_t c1          = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c2          = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c3          = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint8x16_t acc = vaddq_s8(vmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vaddq_s8(vmulq_qs8(a, acc, fixed_point_position), c0);
    return vmulq_qs8(a, acc, fixed_point_position);
}
1279
/** Evaluate a degree-4 polynomial on an 8-lane QS16 vector (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.15 to the requested
 * format with a rounding shift (the leading log coefficient gets one extra
 * left shift).
 */
template <bool islog>
inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one         = vdupq_n_s16(1);
    const qint16x8_t c0          = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(coeff_shift, one) : coeff_shift);
    const qint16x8_t c1          = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c2          = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c3          = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a
    qint16x8_t acc = vaddq_s16(vmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vaddq_s16(vmulq_qs16(a, acc, fixed_point_position), c0);
    return vmulq_qs16(a, acc, fixed_point_position);
}
1295
/** Evaluate a degree-4 polynomial on a 16-lane QS8 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.7 to the requested
 * format with a saturating rounding shift (the leading log coefficient gets
 * one extra left shift).
 */
template <bool islog>
inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t coeff_shift = vdupq_n_s8(-(7 - fixed_point_position));
    const qint8x16_t one         = vdupq_n_s8(1);
    const qint8x16_t c0          = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(coeff_shift, one) : coeff_shift);
    const qint8x16_t c1          = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], coeff_shift);
    const qint8x16_t c2          = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], coeff_shift);
    const qint8x16_t c3          = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint8x16_t acc = vqaddq_s8(vqmulq_qs8(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s8(vqmulq_qs8(a, acc, fixed_point_position), c0);
    return vqmulq_qs8(a, acc, fixed_point_position);
}
1311
/** Evaluate a degree-4 polynomial on an 8-lane QS16 vector with saturating
 *  arithmetic (Horner scheme).
 *
 * Coefficients come from the q-register log table when islog is true,
 * otherwise from the exp table; they are rescaled from Q0.15 to the requested
 * format with a saturating rounding shift (the leading log coefficient gets
 * one extra left shift).
 */
template <bool islog>
inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t coeff_shift = vdupq_n_s16(-(15 - fixed_point_position));
    const qint16x8_t one         = vdupq_n_s16(1);
    const qint16x8_t c0          = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(coeff_shift, one) : coeff_shift);
    const qint16x8_t c1          = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], coeff_shift);
    const qint16x8_t c2          = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], coeff_shift);
    const qint16x8_t c3          = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], coeff_shift);
    // (((c3 * a + c2) * a + c1) * a + c0) * a, all ops saturating
    qint16x8_t acc = vqaddq_s16(vqmulq_qs16(a, c3, fixed_point_position), c2);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c1);
    acc            = vqaddq_s16(vqmulq_qs16(a, acc, fixed_point_position), c0);
    return vqmulq_qs16(a, acc, fixed_point_position);
}
1327
/** Saturating exponential e^a of an 8-lane QS8 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return e^a, saturated to the QS8 range
 */
inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2), rescaled from Q0.7
    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha           = vqabs_qs8(vqsub_s8(a, alpha));

    // Polynomial Approximation
    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
    poly           = vqadd_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshl_s8(poly, dec_m);

    return poly;
}
1353
/** Saturating exponential e^a of a 4-lane QS16 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return e^a, saturated to the QS16 range
 */
inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2), rescaled from Q0.15
    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabs_qs16(vqsub_s16(a, alpha));

    // Polynomial Approximation
    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
    poly            = vqadd_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshl_s16(poly, dec_m);

    return poly;
}
1379
/** Saturating exponential e^a of a 16-lane QS8 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return e^a, saturated to the QS8 range
 */
inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2), rescaled from Q0.7
    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));

    // Polynomial Approximation
    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
    poly            = vqaddq_s8(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshlq_s8(poly, dec_m);

    return poly;
}
1405
/** Saturating exponential e^a of an 8-lane QS16 vector.
 *
 * Range reduction: m = a / ln(2); its integer part dec_m is split off, the
 * remainder alpha = |a - dec_m * ln(2)| is fed to the Taylor polynomial, and
 * the result is reconstructed as 2^dec_m * (1 + poly) via a saturating shift.
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return e^a, saturated to the QS16 range
 */
inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
{
    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2), rescaled from Q0.15
    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2): fractional part OR'd with 1.0

    // Perform range reduction [-log(2),log(2)]
    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)

    // Integer part of m (arithmetic shift right by fixed_point_position)
    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));

    // alpha = |a - dec_m * ln(2)|
    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));

    // Polynomial Approximation
    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
    poly            = vqaddq_s16(poly, const_one);

    // Reconstruct: multiply by 2^dec_m with a saturating shift
    poly = vqshlq_s16(poly, dec_m);

    return poly;
}
1431
/** Natural logarithm of an 8-lane QS8 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return log(a) in the same QS8 format
 */
inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
{
    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
    const qint8x8_t const_seven_dec = vdup_n_s8(7);
    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
    qint8x8_t recip           = vdup_n_s8(0);
    // Select a for lanes with a < 1 (0 elsewhere). The operands were swapped,
    // which fed 0 - instead of a - into the reciprocal for exactly the lanes
    // that need it; vlogq_qs8 uses this (correct) order.
    recip = vbsl_s8(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs8(recip, fixed_point_position);
    a     = vbsl_s8(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
    const qint8x8_t sum             = vmul_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);

    return poly;
}
1470
/** Natural logarithm of a 4-lane QS16 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS16 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS16 format
 *
 * @return log(a) in the same QS16 format
 */
inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
{
    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
    qint16x4_t recip           = vdup_n_s16(0);
    // Select a for lanes with a < 1 (0 elsewhere). The operands were swapped,
    // which fed 0 - instead of a - into the reciprocal for exactly the lanes
    // that need it; vlogq_qs16 uses this (correct) order.
    recip = vbsl_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal
    recip = vrecip_qs16(recip, fixed_point_position);
    a     = vbsl_s16(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
    const qint16x4_t sum             = vmul_s16(shift_value, const_one);

    // Polynomial Approximation
    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);

    return poly;
}
1509
/** Natural logarithm of a 16-lane QS8 vector.
 *
 * For lanes with 0 < a < 1 the function computes log(1/a) and negates the
 * result. The input is decomposed as a = 2^n * x with x in (1, 2]; log(x) is
 * approximated by a Taylor polynomial and log(a) = (poly + n) * ln(2).
 *
 * @param[in] a                    Input QS8 vector
 * @param[in] fixed_point_position Number of fractional bits of the QS8 format
 *
 * @return log(a) in the same QS8 format
 */
inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
{
    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)

    // If 0 < a < 1, calculate log(1/x)
    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
    qint8x16_t recip           = vdupq_n_s8(0);
    recip                      = vbslq_s8(calc_reciprocal, a, recip); // keep a only in lanes with a < 1, 0 elsewhere

    // Calculate reciprocal
    recip = vrecipq_qs8(recip, fixed_point_position);
    a     = vbslq_s8(calc_reciprocal, recip, a); // replace lanes with a < 1 by 1/a

    // Integer part of a (arithmetic shift right by fixed_point_position)
    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));

    // Get x to range (1, 2]
    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);

    // Polynomial Approximation
    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (poly + n) * ln(2)
    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);

    return poly;
}
1548
inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Natural logarithm of a fixed-point QS16 vector (8 elements).
    // Strategy: for 0 < a < 1 compute -log(1/a); normalize the argument into
    // (1, 2], approximate with a Taylor polynomial, then scale the reconstructed
    // base-2 logarithm by ln(2).
    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15); // number of non-sign bits in a 16-bit value
    const qint16x8_t const_ln2 = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2) in Q0.15, rescaled to the current format

    // If 0 < a < 1, calculate log(1/x)
    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
    qint16x8_t recip = vdupq_n_s16(0);
    recip = vbslq_s16(calc_reciprocal, a, recip);

    // Calculate reciprocal (saturating)
    recip = vqrecipq_qs16(recip, fixed_point_position);
    a = vbslq_s16(calc_reciprocal, recip, a);

    // Get decimal part of a
    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
    qint16x8_t dec_a = vshlq_s16(a, shift_value); // a >> fixed_point_position

    // Get exponent of 2^n which is equal or less than dec_a
    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));

    // Get x to range (1, 2]
    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
    const qint16x8_t temp = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
    const qint16x8_t sum = vmulq_s16(shift_value, const_one); // n * 1.0: integer part of log2(a)

    // Polynomial Approximation
    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);

    // Reconstruct: log(a) = (fractional log2 + n) * ln(2)
    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);

    // Set negative value for 0 < a < 1
    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);

    return poly;
}
1587
inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector (8 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint8x8_t three = vdup_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Exponent of a relative to the fixed point position: 8 - clz(a) - fpp.
    const qint8x8_t exponent = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    // Shifting a left by the negated exponent normalizes it into the (0.5, 2) range.
    const qint8x8_t norm_shift = vneg_s8(exponent);

    // Add one to negative exponents so halving them (arithmetic shift right by 1)
    // produces the correct un-normalization shift.
    const uint8x8_t exp_is_neg = vclt_s8(exponent, vdup_n_qs8(0));
    const qint8x8_t adj_exp = vbsl_s8(exp_is_neg, vadd_s8(exponent, vdup_n_s8(1)), exponent);
    const qint8x8_t denorm_shift = vneg_s8(vshr_n_s8(adj_exp, 1));

    // Normalized input also serves as the initial guess
    const qint8x8_t norm = vshl_s8(a, norm_shift);
    qint8x8_t x = norm;

    // Newton-Raphson step: x <- (x / 2) * (3 - norm * x^2).
    // Three rounds give the full result for 8 bit.
    for(int i = 0; i < 3; ++i)
    {
        const qint8x8_t x_sq = vmul_qs8(x, x, fixed_point_position);
        const qint8x8_t prod = vmul_qs8(norm, x_sq, fixed_point_position);
        x = vshr_n_s8(vmul_qs8(x, vsub_s8(three, prod), fixed_point_position), 1);
    }

    // Undo the range normalization
    return vshl_s8(x, denorm_shift);
}
1614
inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector (4 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vshl_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshl_s16(x, shift_value2);
}
1643
inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector
    // (8 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x8_t temp = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
    temp = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshl_s8(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint8x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s8(x, shift_value2);
}
1670
inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector
    // (4 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x4_t temp = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
    uint16x4_t temp_ltz = vclt_s16(temp, vdup_n_qs16(0));
    temp = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshl_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x4_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshl_s16(x, shift_value2);
}
1699
inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector (16 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint8x16_t three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Exponent of a relative to the fixed point position: 8 - clz(a) - fpp.
    const qint8x16_t exponent = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    // Shifting a left by the negated exponent normalizes it into the (0.5, 2) range.
    const qint8x16_t norm_shift = vnegq_s8(exponent);

    // Add one to negative exponents so halving them (arithmetic shift right by 1)
    // produces the correct un-normalization shift.
    const uint8x16_t exp_is_neg = vcltq_s8(exponent, vdupq_n_qs8(0));
    const qint8x16_t adj_exp = vbslq_s8(exp_is_neg, vaddq_s8(exponent, vdupq_n_s8(1)), exponent);
    const qint8x16_t denorm_shift = vnegq_s8(vshrq_n_s8(adj_exp, 1));

    // Normalized input also serves as the initial guess
    const qint8x16_t norm = vshlq_s8(a, norm_shift);
    qint8x16_t x = norm;

    // Newton-Raphson step: x <- (x / 2) * (3 - norm * x^2).
    // Three rounds give the full result for 8 bit.
    for(int i = 0; i < 3; ++i)
    {
        const qint8x16_t x_sq = vmulq_qs8(x, x, fixed_point_position);
        const qint8x16_t prod = vmulq_qs8(norm, x_sq, fixed_point_position);
        x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(three, prod), fixed_point_position), 1);
    }

    // Undo the range normalization
    return vshlq_s8(x, denorm_shift);
}
1726
inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector (8 elements)
    // via Newton-Raphson iteration. Non-saturating variant.
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vshlq_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vshlq_s16(x, shift_value2);
}
1755
inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS8 vector
    // (16 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint8x16_t temp = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
    uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
    temp = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshlq_s8(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint8x16_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After three iterations we have the result for 8 bit
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s8(x, shift_value2);
}
1782
inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating inverse square root (1 / sqrt(a)) of a fixed-point QS16 vector
    // (8 elements) via Newton-Raphson iteration; uses saturating arithmetic
    // (vq*) to avoid wrap-around on overflow.
    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position); // 3.0 in fixed point

    // Find shift value. Number must be in (0.5, 2) range.
    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));

    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
    qint16x8_t temp = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
    uint16x8_t temp_ltz = vcltq_s16(temp, vdupq_n_qs16(0));
    temp = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1)); // halved exponent, undoes the normalization at the end

    temp = vqshlq_s16(a, shift_value); // normalized input in (0.5, 2)

    // Initial guess
    qint16x8_t x = temp;

    // Calculate (x / 2) * (3 - a * x^2)
    // After five iterations we have the result for 16 bit
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);

    return vqshlq_s16(x, shift_value2);
}
1811
inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS8 vector (8 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); // 1.0 in fixed point
    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); // 2.0 in fixed point

    qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
    qint8x8_t num = vqsub_qs8(exp2x, const_one);
    qint8x8_t den = vqadd_qs8(exp2x, const_one);
    // Use the saturating reciprocal: this is the saturating variant, and the
    // non-saturating vrecip_qs8 previously used here could wrap around on
    // overflow (vqtanhq_qs8 already uses the saturating vqrecipq_qs8).
    qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1824
inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS16 vector (4 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position); // 2.0 in fixed point

    qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
    qint16x4_t num = vqsub_qs16(exp2x, const_one);
    qint16x4_t den = vqadd_qs16(exp2x, const_one);
    // Use the saturating reciprocal: this is the saturating variant, and the
    // non-saturating vrecip_qs16 previously used here could wrap around on
    // overflow (vqtanhq_qs16 already uses the saturating vqrecipq_qs16).
    qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);

    return tanh;
}
1837
inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS8 vector (16 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), evaluated with saturating helpers.
    const qint8x16_t one = vdupq_n_s8(1 << fixed_point_position); // 1.0 in fixed point
    const qint8x16_t two = vdupq_n_s8(2 << fixed_point_position); // 2.0 in fixed point

    const qint8x16_t exp2x       = vqexpq_qs8(vqmulq_qs8(two, a, fixed_point_position), fixed_point_position);
    const qint8x16_t numerator   = vqsubq_qs8(exp2x, one);
    const qint8x16_t denominator = vqaddq_qs8(exp2x, one);

    return vqmulq_qs8(numerator, vqrecipq_qs8(denominator, fixed_point_position), fixed_point_position);
}
1850
inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
{
    // Saturating hyperbolic tangent of a fixed-point QS16 vector (8 elements):
    // tanh(x) = (e^(2x) - 1) / (e^(2x) + 1), evaluated with saturating helpers.
    const qint16x8_t one = vdupq_n_s16(1 << fixed_point_position); // 1.0 in fixed point
    const qint16x8_t two = vdupq_n_s16(2 << fixed_point_position); // 2.0 in fixed point

    const qint16x8_t exp2x       = vqexpq_qs16(vqmulq_qs16(two, a, fixed_point_position), fixed_point_position);
    const qint16x8_t numerator   = vqsubq_qs16(exp2x, one);
    const qint16x8_t denominator = vqaddq_qs16(exp2x, one);

    return vqmulq_qs16(numerator, vqrecipq_qs16(denominator, fixed_point_position), fixed_point_position);
}
1863
inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
{
    // Saturating power function for fixed-point QS8 vectors: a^b = e^(b * ln(a)).
    const qint8x16_t log_a   = vlogq_qs8(a, fixed_point_position);
    const qint8x16_t b_log_a = vqmulq_qs8(b, log_a, fixed_point_position);
    return vqexpq_qs8(b_log_a, fixed_point_position);
}
Michele Di Giorgio8af2dd62017-06-19 15:19:29 +01001868
inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
{
    // Lane-wise maximum of two pairs of float32x4 vectors.
    float32x4x2_t result;
    result.val[0] = vmaxq_f32(a.val[0], b.val[0]);
    result.val[1] = vmaxq_f32(a.val[1], b.val[1]);
    return result;
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001880}