Blame - arm_compute/core/NEON/NEAsymm.h - ml/ComputeLibrary

blob: 234d48882c32a9178161e221e37217217d630812 [file] [log] [blame]

Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	1	/*
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Michalis Spyrou	f464337	2019-11-29 16:17:13 +0000	[diff] [blame]	24	#ifndef ARM_COMPUTE_NEASYMM_H
				25	#define ARM_COMPUTE_NEASYMM_H
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	26
Manuel Bottini	7bb56c6	2019-06-26 15:17:09 +0100	[diff] [blame]	27	#include "arm_compute/core/NEON/NEMath.h"
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	28	#include <arm_neon.h>
				29
				30	namespace arm_compute
				31	{
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	32	using qasymm8x8_t = uint8x8_t; /*< 8 bit quantized asymmetric vector with 8 elements /
				33	using qasymm8x8x2_t = uint8x8x2_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				34	using qasymm8x8x3_t = uint8x8x3_t; /*< 8 bit quantized asymmetric vector with 24 elements /
				35	using qasymm8x8x4_t = uint8x8x4_t; /*< 8 bit quantized asymmetric vector with 32 elements /
				36	using qasymm8x16_t = uint8x16_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				37
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	38	using qasymm8x8_signed_t = int8x8_t; /*< 8 bit quantized signed asymmetric vector with 8 elements /
				39	using qasymm8x8x2_signed_t = int8x8x2_t; /*< 8 bit quantized signed asymmetric vector with 16 elements /
				40	using qasymm8x8x3_signed_t = int8x8x3_t; /*< 8 bit quantized signed asymmetric vector with 24 elements /
				41	using qasymm8x8x4_signed_t = int8x8x4_t; /*< 8 bit quantized signed asymmetric vector with 32 elements /
				42	using qasymm8x16_signed_t = int8x16_t; /*< 8 bit quantized signed asymmetric vector with 16 elements /
				43
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	44	/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
				45	*
				46	* vd*vs + vo
				47	*
				48	* @param[in] vd Input vector value in QASYMM8 format
				49	* @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
				50	* @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
				51	*
				52	* @return A 16-component vector in QASYMM8 format, saturated to fit
				53	*/
				54	uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	55
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	56	/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
				57	*
				58	* vd*vs + vo
				59	*
				60	* @param[in] vd Input vector value in QASYMM8_SIGNED format
				61	* @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
				62	* @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
				63	*
				64	* @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
				65	*/
				66	int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
				67
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	68	/** Performs final quantization step on 16 elements
				69	*
				70	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				71	*
				72	* @param in_s32 Input to be quantized.
				73	* @param result_fixedpoint_multiplier Result multiplier parameter
				74	* @param result_shift Result shift parameter
				75	* @param result_offset_after_shift_s32 Result offset parameter
				76	* @param min_u8 Relu lower bound
				77	* @param max_u8 Relu upper bound
				78	*
				79	* @return Quantized values
				80	*/
				81	template <bool is_bounded_relu>
				82	uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
				83	int result_fixedpoint_multiplier,
				84	int32_t result_shift,
				85	int32x4_t result_offset_after_shift_s32,
				86	uint8x16_t min_u8,
				87	uint8x16_t max_u8)
				88	{
				89	const static int32x4_t zero_s32 = vdupq_n_s32(0);
				90
				91	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				92	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				93	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				94	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				95	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				96
				97	// Round to the nearest division by a power-of-two using result_shift_s32
				98	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				99	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				100	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				101	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				102
				103	// Add the offset terms
				104	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				105	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				106	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				107	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				108
				109	// Saturate negative values
				110	in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
				111	in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
				112	in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
				113	in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
				114
				115	// Convert S32 to S16
				116	const int16x8x2_t in_s16 =
				117	{
				118	{
				119	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				120	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				121	}
				122	};
				123
				124	// Convert S16 to U8
				125	uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
				126
				127	if(is_bounded_relu)
				128	{
				129	out_u8 = vmaxq_u8(out_u8, min_u8);
				130	out_u8 = vminq_u8(out_u8, max_u8);
				131	}
				132
				133	return out_u8;
				134	}
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	135
Georgios Pinitas	448a81f	2019-11-21 14:10:25 +0000	[diff] [blame]	136	/** Performs final quantization step on 16 elements
				137	*
				138	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				139	*
				140	* @param in_s32 Input to be quantized.
				141	* @param result_fixedpoint_multiplier Result multiplier parameter
				142	* @param result_shift Result shift parameter
				143	* @param result_offset_after_shift_s32 Result offset parameter
				144	* @param min_s8 Relu lower bound
				145	* @param max_s8 Relu upper bound
				146	*
				147	* @return Quantized values
				148	*/
				149	template <bool is_bounded_relu>
				150	int8x16_t finalize_quantization(int32x4x4_t &in_s32,
				151	int result_fixedpoint_multiplier,
				152	int32_t result_shift,
				153	int32x4_t result_offset_after_shift_s32,
				154	int8x16_t min_s8,
				155	int8x16_t max_s8)
				156	{
				157	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				158	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				159	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				160	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				161	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				162
				163	// Round to the nearest division by a power-of-two using result_shift_s32
				164	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				165	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				166	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				167	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				168
				169	// Add the offset terms
				170	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				171	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				172	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				173	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				174
				175	// Convert S32 to S16
				176	const int16x8x2_t in_s16 =
				177	{
				178	{
				179	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				180	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				181	}
				182	};
				183
				184	// Convert S16 to S8
				185	int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
				186
				187	if(is_bounded_relu)
				188	{
				189	out_s8 = vmaxq_s8(out_s8, min_s8);
				190	out_s8 = vminq_s8(out_s8, max_s8);
				191	}
				192
				193	return out_s8;
				194	}
				195
Georgios Pinitas	dbdea0d	2019-10-16 19:21:40 +0100	[diff] [blame]	196	/** Performs final quantization step on 16 elements for symmetric quantization
				197	*
				198	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				199	*
				200	* @param in_s32 Input to be quantized.
				201	* @param result_fixedpoint_multiplier Result multiplier parameter
				202	* @param result_shift Result shift parameter
				203	* @param result_offset_after_shift_s32 Result offset parameter
				204	* @param min_s8 Relu lower bound
				205	* @param max_s8 Relu upper bound
				206	*
				207	* @return Quantized values
				208	*/
				209	template <bool is_bounded_relu>
				210	inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
				211	const int32x4x4_t &result_fixedpoint_multiplier,
				212	const int32x4x4_t &result_shift,
				213	const int32x4_t &result_offset_after_shift_s32,
				214	const int8x16_t &min_s8,
				215	const int8x16_t &max_s8)
				216	{
				217	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				218	in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
				219	in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
				220	in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
				221	in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
				222
				223	// Round to the nearest division by a power-of-two using result_shift_s32
				224	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
				225	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
				226	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
				227	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
				228
				229	// Add the offset terms
				230	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				231	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				232	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				233	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				234
				235	// Convert S32 to S16
				236	const int16x8x2_t in_s16 =
				237	{
				238	{
				239	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				240	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				241	}
				242	};
				243
				244	// Convert S16 to S8
				245	int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
				246
				247	if(is_bounded_relu)
				248	{
				249	out_s8 = vmaxq_s8(out_s8, min_s8);
				250	out_s8 = vminq_s8(out_s8, max_s8);
				251	}
				252
				253	return out_s8;
				254	}
				255
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	256	/** Performs final quantization step on single element
				257	*
				258	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				259	*
				260	* @param[in] in_value Input to be quantized.
				261	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				262	* @param[in] result_shift Result shift parameter
				263	* @param[in] result_offset_after_shift_s32 Result offset parameter
				264	* @param[in] min_u8 Relu lower bound
				265	* @param[in] max_u8 Relu upper bound
				266	*
				267	* @return Quantized value
				268	*/
				269	template <bool is_bounded_relu>
				270	inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
				271	int32_t result_shift, int32_t result_offset_after_shift_s32,
				272	uint8_t min_u8, uint8_t max_u8)
				273	{
				274	int32x4_t in_s32 = vdupq_n_s32(in_value);
				275
				276	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				277	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				278
				279	// Shift value by result_shift_s32
				280	in_value = rounding_divide_by_pow2(in_value, result_shift);
				281
				282	// Add the offset term
				283	in_value += result_offset_after_shift_s32;
				284
				285	// Bound the result
Georgios Pinitas	6fa2638	2019-03-18 10:05:34 +0000	[diff] [blame]	286	uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	287	if(is_bounded_relu)
				288	{
				289	out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
				290	}
				291
				292	return out_u8;
				293	}
				294
Georgios Pinitas	dbdea0d	2019-10-16 19:21:40 +0100	[diff] [blame]	295	/** Performs final quantization step on single element
				296	*
				297	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				298	*
				299	* @param[in] in_value Input to be quantized.
				300	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				301	* @param[in] result_shift Result shift parameter
				302	* @param[in] result_offset_after_shift_s32 Result offset parameter
				303	* @param[in] min_s8 Relu lower bound
				304	* @param[in] max_s8 Relu upper bound
				305	*
				306	* @return Quantized value
				307	*/
				308	template <bool is_bounded_relu>
				309	inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
				310	int32_t result_shift, int32_t result_offset_after_shift_s32,
				311	int8_t min_s8, int8_t max_s8)
				312	{
				313	int32x4_t in_s32 = vdupq_n_s32(in_value);
				314
				315	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				316	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				317
				318	// Shift value by result_shift_s32
				319	in_value = rounding_divide_by_pow2(in_value, result_shift);
				320
				321	// Add the offset term
				322	in_value += result_offset_after_shift_s32;
				323
				324	// Bound the result
				325	int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
				326	if(is_bounded_relu)
				327	{
				328	out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
				329	}
				330
				331	return out_s8;
				332	}
				333
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	334	/** Dequantize a neon vector holding 8 quantized values.
				335	*
				336	* @param[in] qv Input values to be dequantized.
				337	* @param[in] qi Quantization information to be used in the computation.
				338	*
				339	* @return Dequantized values in a neon vector
				340	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	341	inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	342	{
				343	const float scale = qi.scale;
				344	const int offset = qi.offset;
				345	const int32x4_t voffset = vdupq_n_s32(offset);
				346	const float32x4_t vscale = vdupq_n_f32(scale);
				347	const float32x4x2_t vdequantized_input =
				348	{
				349	{
				350	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
				351	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
				352	}
				353	};
				354	return vdequantized_input;
				355	}
				356
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	357	/** Dequantize a neon vector holding 8 singed quantized values.
				358	*
				359	* @param[in] qv Input values to be dequantized.
				360	* @param[in] qi Quantization information to be used in the computation.
				361	*
				362	* @return Dequantized values in a neon vector
				363	*/
				364	inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
				365	{
				366	const float scale = qi.scale;
				367	const int offset = qi.offset;
				368	const int32x4_t voffset = vdupq_n_s32(offset);
				369	const float32x4_t vscale = vdupq_n_f32(scale);
				370	const float32x4x2_t vdequantized_input =
				371	{
				372	{
				373	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
				374	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
				375	}
				376	};
				377	return vdequantized_input;
				378	}
				379
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	380	/** Dequantize a neon vector holding 16 quantized values.
				381	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	382	* @param[in] qv Input values to be dequantized.
				383	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	384	*
				385	* @return Dequantized values in a neon vector
				386	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	387	inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	388	{
				389	const float scale = qi.scale;
				390	const int offset = qi.offset;
				391	const int32x4_t voffset = vdupq_n_s32(offset);
				392	const float32x4_t vscale = vdupq_n_f32(scale);
				393	const float32x4x4_t vdequantized_input =
				394	{
				395	{
				396	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				397	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				398	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				399	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				400	}
				401	};
				402	return vdequantized_input;
				403	}
				404
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	405	/** Dequantize a neon vector holding 16 signed quantized values.
				406	*
				407	* @param[in] qv Input values to be dequantized.
				408	* @param[in] qi Quantization information to be used in the computation.
				409	*
				410	* @return Dequantized values in a neon vector
				411	*/
				412	inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
				413	{
				414	const float scale = qi.scale;
				415	const int offset = qi.offset;
				416	const int32x4_t voffset = vdupq_n_s32(offset);
				417	const float32x4_t vscale = vdupq_n_f32(scale);
				418	const float32x4x4_t vdequantized_input =
				419	{
				420	{
				421	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				422	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				423	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				424	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				425	}
				426	};
				427	return vdequantized_input;
				428	}
				429
Georgios Pinitas	3d13af8	2019-06-04 13:04:16 +0100	[diff] [blame]	430	/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
				431	*
				432	* @param[in] qv Input values to be dequantized.
				433	* @param[in] scale Quantization scaling factor.
				434	* @param[in] offset Zero quantization offset.
				435	*
				436	* @return Dequantized values in a neon vector
				437	*/
				438	inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
				439	{
				440	const int32x4_t voffset = vdupq_n_s32(offset);
				441	const float32x4_t vscale = vdupq_n_f32(scale);
				442	const float32x4x4_t vdequantized_input =
				443	{
				444	{
				445	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				446	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				447	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				448	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				449	}
				450	};
				451	return vdequantized_input;
				452	}
				453
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	454	/** Dequantize following symmetric quantization scheme a neon vector holding 16 quantized values.
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	455	*
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	456	* @param[in] qv Input values to be dequantized.
				457	* @param[in] vscale Vector containing quantization scaling factors.
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	458	*
				459	* @return Dequantized values in a neon vector
				460	*/
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	461	inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	462	{
				463	const float32x4x4_t vdequantized_input =
				464	{
				465	{
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	466	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
				467	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
				468	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
				469	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	470	}
				471	};
				472	return vdequantized_input;
				473	}
				474
Georgios Pinitas	3d13af8	2019-06-04 13:04:16 +0100	[diff] [blame]	475	/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
				476	*
				477	* @param[in] qv Input values to be dequantized.
				478	* @param[in] scale Quantization scaling factor.
				479	*
				480	* @return Dequantized values in a neon vector
				481	*/
				482	inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
				483	{
				484	const float32x4_t vscale = vdupq_n_f32(scale);
				485	const float32x4x4_t vdequantized_input =
				486	{
				487	{
				488	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				489	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				490	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				491	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				492	}
				493	};
				494	return vdequantized_input;
				495	}
				496
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	497	/** Quantize a neon vector holding 8 floating point values.
				498	*
				499	* @param[in] qv Input values to be quantized.
				500	* @param[in] qi Quantization information to be used in the computation.
				501	*
				502	* @return A neon vector holding the quantized values
				503	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	504	inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	505	{
				506	const float scale = qi.scale;
				507	const int offset = qi.offset;
				508	const float32x4_t voffset = vdupq_n_f32(offset);
				509	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				510	const int32x4x4_t rf =
				511	{
				512	{
				513	#ifdef __aarch64__
				514	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				515	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				516	#else //__aarch64__
				517	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				518	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				519	#endif //__aarch64__
				520	}
				521	};
				522	return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				523	}
				524
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	525	/** Quantize a neon vector holding 8 floating point values.
				526	*
				527	* @param[in] qv Input values to be quantized.
				528	* @param[in] qi Quantization information to be used in the computation.
				529	*
				530	* @return A neon vector holding the singed quantized values
				531	*/
				532	inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
				533	{
				534	const float scale = qi.scale;
				535	const int offset = qi.offset;
				536	const float32x4_t voffset = vdupq_n_f32(offset);
				537	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				538	const int32x4x4_t rf =
				539	{
				540	{
				541	#ifdef __aarch64__
				542	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				543	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				544	#else //__aarch64__
				545	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				546	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				547	#endif //__aarch64__
				548	}
				549	};
				550	return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				551	}
				552
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	553	/** Quantize a neon vector holding 16 floating point values.
				554	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	555	* @param[in] qv Input values to be quantized.
				556	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	557	*
				558	* @return A neon vector holding the quantized values
				559	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	560	inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	561	{
				562	const float scale = qi.scale;
				563	const int offset = qi.offset;
				564	const float32x4_t voffset = vdupq_n_f32(offset);
				565	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				566	const int32x4x4_t rf =
				567	{
				568	{
				569	#ifdef __aarch64__
				570	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				571	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				572	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				573	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				574	#else //__aarch64__
				575	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				576	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				577	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				578	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				579	#endif //__aarch64__
				580	}
				581	};
				582	const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				583	const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				584	return vcombine_u8(pa, pb);
				585	}
Michele Di Giorgio	d64a46c	2019-10-01 12:25:49 +0100	[diff] [blame]	586
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame^]	587	/** Signed quantize a neon vector holding 16 floating point values.
				588	*
				589	* @param[in] qv Input values to be quantized.
				590	* @param[in] qi Quantization information to be used in the computation.
				591	*
				592	* @return A neon vector holding the quantized values
				593	*/
				594
				595	inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
				596	{
				597	const float scale = qi.scale;
				598	const int offset = qi.offset;
				599	const float32x4_t voffset = vdupq_n_f32(offset);
				600	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				601	const int32x4x4_t rf =
				602	{
				603	{
				604	#ifdef __aarch64__
				605	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				606	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				607	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				608	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				609	#else //__aarch64__
				610	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				611	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				612	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				613	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				614	#endif //__aarch64__
				615
				616	}
				617	};
				618	const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				619	const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				620	return vcombine_s8(pa, pb);
				621	}
				622
Michele Di Giorgio	d64a46c	2019-10-01 12:25:49 +0100	[diff] [blame]	623	/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
				624	*
				625	* @param[in] qv Input values to be quantized.
				626	* @param[in] qi Quantization information to be used in the computation.
				627	*
				628	* @return A neon vector holding the quantized values
				629	*/
				630	inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
				631	{
				632	const float scale = qi.scale;
				633	const int offset = qi.offset;
				634	const float32x4_t voffset = vdupq_n_f32(offset);
				635	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				636	const int32x4x4_t rf =
				637	{
				638	{
				639	#ifdef __aarch64__
				640	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				641	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				642	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				643	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				644	#else //__aarch64__
				645	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				646	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				647	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				648	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				649	#endif //__aarch64__
				650	}
				651	};
				652	const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
				653	const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
				654	return { pa, pb };
				655	}
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	656	} // namespace arm_compute
				657	#include "arm_compute/core/NEON/NEAsymm.inl"
Michalis Spyrou	f464337	2019-11-29 16:17:13 +0000	[diff] [blame]	658	#endif // ARM_COMPUTE_NEASYMM_H