Blame - arm_compute/core/NEON/NEFixedPoint.inl - ml/ComputeLibrary

blob: b57fd3edd2527f23936951d74c7b33c9c3d32ca7 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	namespace arm_compute
				26	{
				27	/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
				28	* Format is in Q0.7 for all elements */
				29	const std::array<qint8x8_t, 4> exp_tab_qs8 =
				30	{
				31	{
				32	vdup_n_s8(0x7F), // 0.9978546
				33	vdup_n_s8(0x3F), // 0.4994721
				34	vdup_n_s8(0x16), // 0.1763723
				35	vdup_n_s8(0x05), // 0.0435108
				36	}
				37	};
				38
				39	/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
				40	* Format is in Q0.7 for all elements */
				41	const std::array<qint8x16_t, 4> exp_tabq_qs8 =
				42	{
				43	{
				44	vdupq_n_s8(0x7F), // 0.9978546
				45	vdupq_n_s8(0x3F), // 0.4994721
				46	vdupq_n_s8(0x16), // 0.1763723
				47	vdupq_n_s8(0x05), // 0.0435108
				48	}
				49	};
				50
				51	/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
				52	* Format is in Q0.7 for all elements except the first one which is in Q1.6 */
				53	const std::array<qint8x8_t, 4> log_tab_qs8 =
				54	{
				55	{
				56	vdup_n_s8(0x5C), // 1.4384189
				57	vdup_n_s8(-0x56), // -0.6771900
				58	vdup_n_s8(0x29), // 0.3218538
				59	vdup_n_s8(-0x0A), // -0.0832229
				60	}
				61	};
				62
				63	/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
				64	* Format is in Q0.7 for all elements except the first one which is in Q1.6 */
				65	const std::array<qint8x16_t, 4> log_tabq_qs8 =
				66	{
				67	{
				68	vdupq_n_s8(0x5C), // 1.4384189
				69	vdupq_n_s8(-0x56), // -0.6771900
				70	vdupq_n_s8(0x29), // 0.3218538
				71	vdupq_n_s8(-0x0A), // -0.0832229
				72	}
				73	};
				74
				75	inline qint8x8_t vget_low_qs8(qint8x16_t a)
				76	{
				77	return vget_low_s8(a);
				78	}
				79
				80	inline qint8x8_t vget_high_qs8(qint8x16_t a)
				81	{
				82	return vget_high_s8(a);
				83	}
				84
				85	inline qint8x8_t vld1_qs8(const qint8_t *addr)
				86	{
				87	return vld1_s8(addr);
				88	}
				89
				90	inline qint8x16_t vld1q_qs8(const qint8_t *addr)
				91	{
				92	return vld1q_s8(addr);
				93	}
				94
				95	inline qint16x4_t vld1_qs16(const qint16_t *addr)
				96	{
				97	return vld1_s16(addr);
				98	}
				99
				100	inline qint16x8_t vld1q_qs16(const qint16_t *addr)
				101	{
				102	return vld1q_s16(addr);
				103	}
				104
				105	inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
				106	{
				107	return vld1_dup_s8(addr);
				108	}
				109
				110	inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
				111	{
				112	return vld1q_dup_s8(addr);
				113	}
				114
				115	inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
				116	{
				117	vst1_s8(addr, b);
				118	}
				119
				120	inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
				121	{
				122	vst1q_s8(addr, b);
				123	}
				124
				125	inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
				126	{
				127	vst1_s16(addr, b);
				128	}
				129
				130	inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
				131	{
				132	vst1q_s16(addr, b);
				133	}
				134
				135	inline qint8x8_t vqmovn_qs16(qint16x8_t a)
				136	{
				137	return vqmovn_s16(a);
				138	}
				139
				140	inline qint8x8_t vdup_n_qs8(qint8_t a)
				141	{
				142	return vdup_n_s8(a);
				143	}
				144
				145	inline qint8x16_t vdupq_n_qs8(qint8_t a)
				146	{
				147	return vdupq_n_s8(a);
				148	}
				149
				150	inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
				151	{
				152	float32x4x4_t res =
				153	{
				154	{
				155	vdupq_n_f32(a),
				156	vdupq_n_f32(a),
				157	vdupq_n_f32(a),
				158	vdupq_n_f32(a),
				159	}
				160	};
				161	return vcvtq_qs8_f32(res, fixed_point_position);
				162	}
				163
				164	inline qint16x8_t vdupq_n_qs16(qint16_t a)
				165	{
				166	return vdupq_n_s16(a);
				167	}
				168
				169	inline qint8x8_t vabs_qs8(qint8x8_t a)
				170	{
				171	return vabs_s8(a);
				172	}
				173
				174	inline qint8x16_t vabsq_qs8(qint8x16_t a)
				175	{
				176	return vabsq_s8(a);
				177	}
				178
				179	inline qint8x8_t vqabs_qs8(qint8x8_t a)
				180	{
				181	return vqabs_s8(a);
				182	}
				183
				184	inline qint8x16_t vqabsq_qs8(qint8x16_t a)
				185	{
				186	return vqabsq_s8(a);
				187	}
				188
				189	inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
				190	{
				191	return vmax_s8(a, b);
				192	}
				193
				194	inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
				195	{
				196	return vmaxq_s8(a, b);
				197	}
				198
				199	inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
				200	{
				201	return vpmax_s8(a, b);
				202	}
				203
				204	inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
				205	{
				206	return vmin_s8(a, b);
				207	}
				208
				209	inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
				210	{
				211	return vminq_s8(a, b);
				212	}
				213
				214	inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
				215	{
				216	return vpmin_s8(a, b);
				217	}
				218
				219	inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
				220	{
				221	return vadd_s8(a, b);
				222	}
				223
				224	inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
				225	{
				226	return vaddq_s8(a, b);
				227	}
				228
				229	inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
				230	{
				231	return vqadd_s8(a, b);
				232	}
				233
				234	inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
				235	{
				236	return vqaddq_s8(a, b);
				237	}
				238
				239	inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
				240	{
				241	return vqadd_s16(a, b);
				242	}
				243
				244	inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
				245	{
				246	return vqaddq_s16(a, b);
				247	}
				248
				249	inline int16x4_t vpaddl_qs8(qint8x8_t a)
				250	{
				251	return vpaddl_s8(a);
				252	}
				253
				254	inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
				255	{
				256	return vsub_s8(a, b);
				257	}
				258
				259	inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
				260	{
				261	return vsubq_s8(a, b);
				262	}
				263
				264	inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
				265	{
				266	return vqsub_s8(a, b);
				267	}
				268
				269	inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
				270	{
				271	return vqsubq_s8(a, b);
				272	}
				273
				274	inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
				275	{
				276	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				277
				278	// Initialize the temporary result with a constant used to round up the result
				279	qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
				280
				281	// Vector multiply-accumulate long
				282	res = vmlal_s8(res, a, b);
				283
				284	// Shift right by fixed_point_position
				285	res = vshlq_s16(res, fixed_point_position_s16);
				286
				287	// Convert back to qint8
				288	return vmovn_s16(res);
				289	}
				290
				291	inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
				292	{
				293	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				294
				295	// Initialize the temporary results with a constant used to round up the result
				296	qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
				297	qint16x8_t res1 = res0;
				298
				299	// Vector multiply-accumulate long
				300	res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
				301	res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
				302
				303	// Shift right by fixed_point_position
				304	res0 = vshlq_s16(res0, fixed_point_position_s16);
				305	res1 = vshlq_s16(res1, fixed_point_position_s16);
				306
				307	// Convert back to qint8
				308	return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
				309	}
				310
				311	inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
				312	{
				313	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				314
				315	// Initialize the temporary result with a constant used to round up the result
				316	qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
				317
				318	// Vector multiply-accumulate long
				319	res = vmlal_s8(res, a, b);
				320
				321	// Shift right by fixed_point_position
				322	res = vqshlq_s16(res, fixed_point_position_s16);
				323
				324	// Convert back to qint8 and saturate
				325	return vqmovn_s16(res);
				326	}
				327
				328	inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
				329	{
				330	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				331
				332	// Initialize the temporary results with a constant used to round up the result
				333	qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
				334	qint16x8_t res1 = res0;
				335
				336	// Vector multiply-accumulate long
				337	res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
				338	res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
				339
				340	// Shift right by fixed_point_position
				341	res0 = vqshlq_s16(res0, fixed_point_position_s16);
				342	res1 = vqshlq_s16(res1, fixed_point_position_s16);
				343
				344	// Convert back to qint8 and saturate
				345	return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
				346	}
				347
				348	inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
				349	{
				350	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				351
				352	qint16x8_t res = vmull_s8(a, b);
				353
				354	return vqrshlq_s16(res, fixed_point_position_s16);
				355	}
				356
				357	inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
				358	{
				359	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				360
				361	// Initialize the temporary results with a constant used to round up the result
				362	qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
				363
				364	// Vector multiply-accumulate long
				365	tmp = vmlal_s8(tmp, b, c);
				366
				367	// Shift right by fixed_point_position
				368	tmp = vshlq_s16(tmp, fixed_point_position_s16);
				369
				370	// Convert back to qint8 and accumulate
				371	return vadd_s8(a, vmovn_s16(tmp));
				372	}
				373
				374	inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
				375	{
				376	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				377
				378	// Initialize the temporary results with a constant used to round up the result
				379	qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
				380	qint16x8_t tmp1 = tmp0;
				381
				382	// Vector multiply-accumulate long
				383	tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
				384	tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
				385
				386	// Shift right by fixed_point_position
				387	tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
				388	tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
				389
				390	// Convert back to qint8 and accumulate
				391	return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
				392	}
				393
				394	inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
				395	{
				396	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				397
				398	// Initialize the temporary results with a constant used to round up the result
				399	qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
				400
				401	// Vector multiply-accumulate long
				402	tmp = vmlal_s8(tmp, b, c);
				403
				404	// Shift right by fixed_point_position
				405	tmp = vqshlq_s16(tmp, fixed_point_position_s16);
				406
				407	// Convert back to qint8 and accumulate
				408	return vqadd_s8(a, vqmovn_s16(tmp));
				409	}
				410
				411	inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
				412	{
				413	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				414
				415	// Initialize the temporary results with a constant used to round up the result
				416	qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
				417	qint16x8_t tmp1 = tmp0;
				418
				419	// Vector multiply-accumulate long
				420	tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
				421	tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
				422
				423	// Shift right by fixed_point_position
				424	tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
				425	tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
				426
				427	// Convert back to qint8 and accumulate
				428	qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
				429	return vqaddq_s8(a, res);
				430	}
				431
				432	inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
				433	{
				434	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				435
				436	// Initialize the temporary results with a constant used to round up the result
				437	qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
				438
				439	// Vector multiply-accumulate long
				440	tmp = vmlal_s8(tmp, b, c);
				441
				442	// Shift right by fixed_point_position
				443	tmp = vshlq_s16(tmp, fixed_point_position_s16);
				444
				445	// Accumulate
				446	return vaddq_s16(a, tmp);
				447	}
				448
				449	inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
				450	{
				451	const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
				452
				453	// Initialize the temporary results with a constant used to round up the result
				454	qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
				455
				456	// Vector multiply-accumulate long
				457	tmp = vmlal_s8(tmp, b, c);
				458
				459	// Shift right by fixed_point_position
				460	tmp = vqshlq_s16(tmp, fixed_point_position_s16);
				461
				462	// Accumulate
				463	return vqaddq_s16(a, tmp);
				464	}
				465
				466	inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
				467	{
				468	const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
				469
				470	float32x4x2_t res_f32 =
				471	{
				472	{
				473	vdupq_n_f32(0.5f),
				474	vdupq_n_f32(0.5f)
				475	}
				476	};
				477
				478	res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
				479	res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
				480
				481	const int32x4x2_t res_s32 =
				482	{
				483	{
				484	vcvtq_s32_f32(res_f32.val[0]),
				485	vcvtq_s32_f32(res_f32.val[1]),
				486	}
				487	};
				488
				489	const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
				490
				491	return vqmovn_s16(res_s16);
				492	}
				493
				494	inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
				495	{
				496	const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
				497
				498	float32x4x4_t res_f32 =
				499	{
				500	{
				501	vdupq_n_f32(0.5f),
				502	vdupq_n_f32(0.5f),
				503	vdupq_n_f32(0.5f),
				504	vdupq_n_f32(0.5f)
				505	}
				506	};
				507
				508	res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
				509	res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
				510	res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
				511	res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
				512
				513	const int32x4x4_t res_s32 =
				514	{
				515	{
				516	vcvtq_s32_f32(res_f32.val[0]),
				517	vcvtq_s32_f32(res_f32.val[1]),
				518	vcvtq_s32_f32(res_f32.val[2]),
				519	vcvtq_s32_f32(res_f32.val[3]),
				520	}
				521	};
				522
				523	const int16x8x2_t res_s16 =
				524	{
				525	{
				526	vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
				527	vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
				528	}
				529	};
				530
				531	return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
				532	}
				533
				534	inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
				535	{
				536	const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
				537
				538	const int16x8_t res_s16 = vmovl_s8(a);
				539
				540	const int32x4x2_t res_s32 =
				541	{
				542	{
				543	vmovl_s16(vget_low_s16(res_s16)),
				544	vmovl_s16(vget_high_s16(res_s16))
				545	}
				546	};
				547
				548	float32x4x2_t res_f32 =
				549	{
				550	{
				551	vcvtq_f32_s32(res_s32.val[0]),
				552	vcvtq_f32_s32(res_s32.val[1])
				553	}
				554	};
				555
				556	res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
				557	res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
				558
				559	return res_f32;
				560	}
				561
				562	inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
				563	{
				564	const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
				565
				566	const int16x8x2_t res_s16 =
				567	{
				568	{
				569	vmovl_s8(vget_low_s8(a)),
				570	vmovl_s8(vget_high_s8(a)),
				571	}
				572	};
				573
				574	const int32x4x4_t res_s32 =
				575	{
				576	{
				577	vmovl_s16(vget_low_s16(res_s16.val[0])),
				578	vmovl_s16(vget_high_s16(res_s16.val[0])),
				579	vmovl_s16(vget_low_s16(res_s16.val[1])),
				580	vmovl_s16(vget_high_s16(res_s16.val[1])),
				581	}
				582	};
				583
				584	float32x4x4_t res_f32 =
				585	{
				586	{
				587	vcvtq_f32_s32(res_s32.val[0]),
				588	vcvtq_f32_s32(res_s32.val[1]),
				589	vcvtq_f32_s32(res_s32.val[2]),
				590	vcvtq_f32_s32(res_s32.val[3])
				591	}
				592	};
				593
				594	res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
				595	res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
				596	res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
				597	res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
				598
				599	return res_f32;
				600	}
				601
				602	inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
				603	{
				604	// We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
				605	const qint8x8_t const_48_over_17 = vdup_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
				606	const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
				607	const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
				608
				609	// Find shift value
				610	const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
				611	const qint8x8_t temp = vshl_s8(a, shift_value);
				612
				613	qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
				614
				615	uint8x8_t set_one = vcgt_s8(x, const_one);
				616	x = vbsl_s8(set_one, const_one, x);
				617
				618	// Use three iterations of Newton-Raphson method to get the result
				619	x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
				620	x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
				621	x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
				622
				623	return vshl_s8(x, shift_value);
				624	}
				625
				626	inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
				627	{
				628	// We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
				629	const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
				630	const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
				631	const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
				632
				633	// Find shift value
				634	const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
				635	const qint8x16_t temp = vshlq_s8(a, shift_value);
				636
				637	qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
				638
				639	// Set initial guess to one if x > 1
				640	uint8x16_t set_one = vcgtq_s8(x, const_one);
				641	x = vbslq_s8(set_one, const_one, x);
				642
				643	// Use three iterations of Newton-Raphson method to get the result
				644	x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				645	x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				646	x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				647
				648	return vshlq_s8(x, shift_value);
				649	}
				650
				651	inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
				652	{
				653	// We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
				654	const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
				655	const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
				656	const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
				657
				658	// Find shift value
				659	const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
				660	const qint8x16_t temp = vqshlq_s8(a, shift_value);
				661
				662	qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
				663
				664	// Set initial guess to one if x > 1
				665	uint8x16_t set_one = vcgtq_s8(x, const_one);
				666	x = vbslq_s8(set_one, const_one, x);
				667
				668	// Use three iterations of Newton-Raphson method to get the result
				669	x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				670	x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				671	x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
				672
				673	return vqshlq_s8(x, shift_value);
				674	}
				675
				676	inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
				677	{
				678	return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
				679	}
				680
				681	inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
				682	{
				683	return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
				684	}
				685
				686	template <bool islog>
				687	inline qint8x8_t vtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
				688	{
				689	const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
				690	const qint8x8_t const_one = vdup_n_s8(1);
				691	const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
				692	const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
				693	const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
				694	const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
				695	const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
				696	const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
				697	const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
				698	const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
				699	return res;
				700	}
				701
				702	template <bool islog>
				703	inline qint8x8_t vqtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
				704	{
				705	const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
				706	const qint8x8_t const_one = vdup_n_s8(1);
				707	const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
				708	const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
				709	const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
				710	const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
				711	const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
				712	const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
				713	const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
				714	const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
				715	return res;
				716	}
				717
				718	template <bool islog>
				719	inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
				720	{
				721	const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
				722	const qint8x16_t const_one = vdupq_n_s8(1);
				723	const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
				724	const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
				725	const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
				726	const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
				727	const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
				728	const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
				729	const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
				730	const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
				731	return res;
				732	}
				733
				734	template <bool islog>
				735	inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
				736	{
				737	const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
				738	const qint8x16_t const_one = vdupq_n_s8(1);
				739	const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
				740	const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
				741	const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
				742	const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
				743	const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
				744	const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
				745	const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
				746	const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
				747	return res;
				748	}
				749
				750	inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
				751	{
				752	const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
				753	const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
				754	const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
				755	const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
				756
				757	// Perform range reduction [-log(2),log(2)]
				758	const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
				759
				760	// get decimal part from m
				761	const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
				762
				763	qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
				764	alpha = vqabs_qs8(vqsub_s8(a, alpha));
				765
				766	// Polynomial Approximation
				767	qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
				768	poly = vqadd_s8(poly, const_one);
				769
				770	// Reconstruct
				771	poly = vqshl_s8(poly, dec_m);
				772
				773	return poly;
				774	}
				775
				776	inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
				777	{
				778	const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
				779	const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
				780	const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
				781	const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
				782
				783	// Perform range reduction [-log(2),log(2)]
				784	const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
				785
				786	// get decimal part from m
				787	const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
				788
				789	qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
				790	alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));
				791
				792	// Polynomial Approximation
				793	qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
				794	poly = vqaddq_s8(poly, const_one);
				795
				796	// Reconstruct
				797	poly = vqshlq_s8(poly, dec_m);
				798
				799	return poly;
				800	}
				801
				802	inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
				803	{
				804	const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
				805	const qint8x8_t const_seven_dec = vdup_n_s8(7);
				806	const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
				807
				808	// If 0 < a < 1, calculate log(1/x)
				809	uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
				810	qint8x8_t recip = vdup_n_s8(0);
				811	recip = vbsl_s8(calc_reciprocal, recip, a);
				812
				813	// Calculate reciprocal
				814	recip = vrecip_qs8(recip, fixed_point_position);
				815	a = vbsl_s8(calc_reciprocal, recip, a);
				816
				817	// Get decimal part of a
				818	qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
				819	qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
				820
				821	// Get exponent of 2^n which is equal or less than dec_a
				822	shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
				823
				824	// Get x to range (1, 2]
				825	const qint8x8_t shift_value_neg = vneg_s8(shift_value);
				826	const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
				827	const qint8x8_t sum = vmul_s8(shift_value, const_one);
				828
				829	// Polynomial Approximation
				830	qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
				831
				832	// Reconstruct
				833	poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
				834
				835	// Set negative value for 0 < a < 1
				836	poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
				837
				838	return poly;
				839	}
				840
				841	inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
				842	{
				843	const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
				844	const qint8x16_t const_seven_dec = vdupq_n_s8(7);
				845	const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
				846
				847	// If 0 < a < 1, calculate log(1/x)
				848	uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
				849	qint8x16_t recip = vdupq_n_s8(0);
				850	recip = vbslq_s8(calc_reciprocal, a, recip);
				851
				852	// Calculate reciprocal
				853	recip = vrecipq_qs8(recip, fixed_point_position);
				854	a = vbslq_s8(calc_reciprocal, recip, a);
				855
				856	// Get decimal part of a
				857	qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
				858	qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
				859
				860	// Get exponent of 2^n which is equal or less than dec_a
				861	shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
				862
				863	// Get x to range (1, 2]
				864	const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
				865	const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
				866	const qint8x16_t sum = vmulq_s8(shift_value, const_one);
				867
				868	// Polynomial Approximation
				869	qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
				870
				871	// Reconstruct
				872	poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
				873
				874	// Set negative value for 0 < a < 1
				875	poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
				876
				877	return poly;
				878	}
				879
				880	inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
				881	{
				882	const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
				883
				884	// Find shift value. Number must be in (0.5, 2) range.
				885	qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
				886
				887	// Add one when the shift value is negative in order to get the correct result when we shift right with 1
				888	qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
				889	uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
				890	temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
				891	qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
				892
				893	temp = vshl_s8(a, shift_value);
				894
				895	// Initial guess
				896	qint8x8_t x = temp;
				897
				898	// Calculate (x / 2) * (3 - a * x^2)
				899	// After three iterations we have the result for 8 bit
				900	x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				901	x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				902	x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				903
				904	return vshl_s8(x, shift_value2);
				905	}
				906
				907	inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
				908	{
				909	const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
				910
				911	// Find shift value. Number must be in (0.5, 2) range.
				912	qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
				913
				914	// Add one when the shift value is negative in order to get the correct result when we shift right with 1
				915	qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
				916	uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
				917	temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
				918	qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
				919
				920	temp = vshl_s8(a, shift_value);
				921
				922	// Initial guess
				923	qint8x8_t x = temp;
				924
				925	// Calculate (x / 2) * (3 - a * x^2)
				926	// After three iterations we have the result for 8 bit
				927	x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				928	x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				929	x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				930
				931	return vshl_s8(x, shift_value2);
				932	}
				933
				934	inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
				935	{
				936	const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
				937
				938	// Find shift value. Number must be in (0.5, 2) range.
				939	qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
				940
				941	// Add one when the shift value is negative in order to get the correct result when we shift right with 1
				942	qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
				943	uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
				944	temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
				945	qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
				946
				947	temp = vshlq_s8(a, shift_value);
				948
				949	// Initial guess
				950	qint8x16_t x = temp;
				951
				952	// Calculate (x / 2) * (3 - a * x^2)
				953	// After three iterations we have the result for 8 bit
				954	x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				955	x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				956	x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				957
				958	return vshlq_s8(x, shift_value2);
				959	}
				960
				961	inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
				962	{
				963	const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
				964
				965	// Find shift value. Number must be in (0.5, 2) range.
				966	qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
				967
				968	// Add one when the shift value is negative in order to get the correct result when we shift right with 1
				969	qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
				970	uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
				971	temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
				972	qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
				973
				974	temp = vshlq_s8(a, shift_value);
				975
				976	// Initial guess
				977	qint8x16_t x = temp;
				978
				979	// Calculate (x / 2) * (3 - a * x^2)
				980	// After three iterations we have the result for 8 bit
				981	x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				982	x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				983	x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
				984
				985	return vshlq_s8(x, shift_value2);
				986	}
				987
				988	inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
				989	{
				990	const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
				991	const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
				992
				993	qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
				994	qint8x8_t num = vqsub_qs8(exp2x, const_one);
				995	qint8x8_t den = vqadd_qs8(exp2x, const_one);
				996	qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
				997
				998	return tanh;
				999	}
				1000
				1001	inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
				1002	{
				1003	const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
				1004	const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
				1005
				1006	qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
				1007	qint8x16_t num = vqsubq_qs8(exp2x, const_one);
				1008	qint8x16_t den = vqaddq_qs8(exp2x, const_one);
				1009	qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
				1010
				1011	return tanh;
				1012	}
				1013
				1014	inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
				1015	{
				1016	return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
				1017	}
Michele Di Giorgio	8af2dd6	2017-06-19 15:19:29 +0100	[diff] [blame^]	1018
				1019	inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
				1020	{
				1021	float32x4x2_t res =
				1022	{
				1023	{
				1024	vmaxq_f32(a.val[0], b.val[0]),
				1025	vmaxq_f32(a.val[1], b.val[1])
				1026	}
				1027	};
				1028	return res;
				1029	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1030	}