Blame - arm_compute/core/NEON/NEAsymm.h - ml/ComputeLibrary

blob: 67adcef9b1734e8b38a4a5e6c7c80da80c7ddc0c [file] [log] [blame]

Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	1	/*
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Michalis Spyrou	f464337	2019-11-29 16:17:13 +0000	[diff] [blame]	24	#ifndef ARM_COMPUTE_NEASYMM_H
				25	#define ARM_COMPUTE_NEASYMM_H
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	26
Manuel Bottini	7bb56c6	2019-06-26 15:17:09 +0100	[diff] [blame]	27	#include "arm_compute/core/NEON/NEMath.h"
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	28	#include <arm_neon.h>
				29
				30	namespace arm_compute
				31	{
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	32	using qasymm8x8_t = uint8x8_t; /*< 8 bit quantized asymmetric vector with 8 elements /
				33	using qasymm8x8x2_t = uint8x8x2_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				34	using qasymm8x8x3_t = uint8x8x3_t; /*< 8 bit quantized asymmetric vector with 24 elements /
				35	using qasymm8x8x4_t = uint8x8x4_t; /*< 8 bit quantized asymmetric vector with 32 elements /
				36	using qasymm8x16_t = uint8x16_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				37
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	38	using qasymm8x8_signed_t = int8x8_t; /*< 8 bit quantized signed asymmetric vector with 8 elements /
				39	using qasymm8x8x2_signed_t = int8x8x2_t; /*< 8 bit quantized signed asymmetric vector with 16 elements /
				40	using qasymm8x8x3_signed_t = int8x8x3_t; /*< 8 bit quantized signed asymmetric vector with 24 elements /
				41	using qasymm8x8x4_signed_t = int8x8x4_t; /*< 8 bit quantized signed asymmetric vector with 32 elements /
				42	using qasymm8x16_signed_t = int8x16_t; /*< 8 bit quantized signed asymmetric vector with 16 elements /
				43
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	44	/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
				45	*
				46	* vd*vs + vo
				47	*
				48	* @param[in] vd Input vector value in QASYMM8 format
				49	* @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
				50	* @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
				51	*
				52	* @return A 16-component vector in QASYMM8 format, saturated to fit
				53	*/
				54	uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	55
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	56	/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
				57	*
				58	* vd*vs + vo
				59	*
				60	* @param[in] vd Input vector value in QASYMM8_SIGNED format
				61	* @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
				62	* @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
				63	*
				64	* @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
				65	*/
				66	int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
				67
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	68	/** Performs final quantization step on 16 elements
				69	*
				70	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				71	*
				72	* @param in_s32 Input to be quantized.
				73	* @param result_fixedpoint_multiplier Result multiplier parameter
				74	* @param result_shift Result shift parameter
				75	* @param result_offset_after_shift_s32 Result offset parameter
				76	* @param min_u8 Relu lower bound
				77	* @param max_u8 Relu upper bound
				78	*
				79	* @return Quantized values
				80	*/
				81	template <bool is_bounded_relu>
				82	uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
				83	int result_fixedpoint_multiplier,
				84	int32_t result_shift,
				85	int32x4_t result_offset_after_shift_s32,
				86	uint8x16_t min_u8,
				87	uint8x16_t max_u8)
				88	{
				89	const static int32x4_t zero_s32 = vdupq_n_s32(0);
				90
				91	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				92	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				93	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				94	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				95	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				96
				97	// Round to the nearest division by a power-of-two using result_shift_s32
				98	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				99	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				100	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				101	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				102
				103	// Add the offset terms
				104	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				105	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				106	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				107	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				108
				109	// Saturate negative values
				110	in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
				111	in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
				112	in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
				113	in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
				114
				115	// Convert S32 to S16
				116	const int16x8x2_t in_s16 =
				117	{
				118	{
				119	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				120	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				121	}
				122	};
				123
				124	// Convert S16 to U8
				125	uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
				126
				127	if(is_bounded_relu)
				128	{
				129	out_u8 = vmaxq_u8(out_u8, min_u8);
				130	out_u8 = vminq_u8(out_u8, max_u8);
				131	}
				132
				133	return out_u8;
				134	}
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	135
Georgios Pinitas	448a81f	2019-11-21 14:10:25 +0000	[diff] [blame]	136	/** Performs final quantization step on 16 elements
				137	*
				138	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				139	*
				140	* @param in_s32 Input to be quantized.
				141	* @param result_fixedpoint_multiplier Result multiplier parameter
				142	* @param result_shift Result shift parameter
				143	* @param result_offset_after_shift_s32 Result offset parameter
				144	* @param min_s8 Relu lower bound
				145	* @param max_s8 Relu upper bound
				146	*
				147	* @return Quantized values
				148	*/
				149	template <bool is_bounded_relu>
				150	int8x16_t finalize_quantization(int32x4x4_t &in_s32,
				151	int result_fixedpoint_multiplier,
				152	int32_t result_shift,
				153	int32x4_t result_offset_after_shift_s32,
				154	int8x16_t min_s8,
				155	int8x16_t max_s8)
				156	{
				157	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				158	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				159	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				160	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				161	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				162
				163	// Round to the nearest division by a power-of-two using result_shift_s32
				164	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				165	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				166	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				167	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				168
				169	// Add the offset terms
				170	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				171	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				172	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				173	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				174
				175	// Convert S32 to S16
				176	const int16x8x2_t in_s16 =
				177	{
				178	{
				179	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				180	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				181	}
				182	};
				183
				184	// Convert S16 to S8
				185	int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
				186
				187	if(is_bounded_relu)
				188	{
				189	out_s8 = vmaxq_s8(out_s8, min_s8);
				190	out_s8 = vminq_s8(out_s8, max_s8);
				191	}
				192
				193	return out_s8;
				194	}
				195
Georgios Pinitas	dbdea0d	2019-10-16 19:21:40 +0100	[diff] [blame]	196	/** Performs final quantization step on 16 elements for symmetric quantization
				197	*
				198	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				199	*
				200	* @param in_s32 Input to be quantized.
				201	* @param result_fixedpoint_multiplier Result multiplier parameter
				202	* @param result_shift Result shift parameter
				203	* @param result_offset_after_shift_s32 Result offset parameter
				204	* @param min_s8 Relu lower bound
				205	* @param max_s8 Relu upper bound
				206	*
				207	* @return Quantized values
				208	*/
				209	template <bool is_bounded_relu>
				210	inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
				211	const int32x4x4_t &result_fixedpoint_multiplier,
				212	const int32x4x4_t &result_shift,
				213	const int32x4_t &result_offset_after_shift_s32,
				214	const int8x16_t &min_s8,
				215	const int8x16_t &max_s8)
				216	{
				217	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				218	in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
				219	in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
				220	in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
				221	in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
				222
				223	// Round to the nearest division by a power-of-two using result_shift_s32
				224	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
				225	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
				226	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
				227	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
				228
				229	// Add the offset terms
				230	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				231	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				232	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				233	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				234
				235	// Convert S32 to S16
				236	const int16x8x2_t in_s16 =
				237	{
				238	{
				239	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				240	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				241	}
				242	};
				243
				244	// Convert S16 to S8
				245	int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
				246
				247	if(is_bounded_relu)
				248	{
				249	out_s8 = vmaxq_s8(out_s8, min_s8);
				250	out_s8 = vminq_s8(out_s8, max_s8);
				251	}
				252
				253	return out_s8;
				254	}
				255
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	256	/** Performs final quantization step on single element
				257	*
				258	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				259	*
				260	* @param[in] in_value Input to be quantized.
				261	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				262	* @param[in] result_shift Result shift parameter
				263	* @param[in] result_offset_after_shift_s32 Result offset parameter
				264	* @param[in] min_u8 Relu lower bound
				265	* @param[in] max_u8 Relu upper bound
				266	*
				267	* @return Quantized value
				268	*/
				269	template <bool is_bounded_relu>
				270	inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
				271	int32_t result_shift, int32_t result_offset_after_shift_s32,
				272	uint8_t min_u8, uint8_t max_u8)
				273	{
				274	int32x4_t in_s32 = vdupq_n_s32(in_value);
				275
				276	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				277	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				278
				279	// Shift value by result_shift_s32
				280	in_value = rounding_divide_by_pow2(in_value, result_shift);
				281
				282	// Add the offset term
				283	in_value += result_offset_after_shift_s32;
				284
				285	// Bound the result
Georgios Pinitas	6fa2638	2019-03-18 10:05:34 +0000	[diff] [blame]	286	uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	287	if(is_bounded_relu)
				288	{
				289	out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
				290	}
				291
				292	return out_u8;
				293	}
				294
Georgios Pinitas	dbdea0d	2019-10-16 19:21:40 +0100	[diff] [blame]	295	/** Performs final quantization step on single element
				296	*
				297	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				298	*
				299	* @param[in] in_value Input to be quantized.
				300	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				301	* @param[in] result_shift Result shift parameter
				302	* @param[in] result_offset_after_shift_s32 Result offset parameter
				303	* @param[in] min_s8 Relu lower bound
				304	* @param[in] max_s8 Relu upper bound
				305	*
				306	* @return Quantized value
				307	*/
				308	template <bool is_bounded_relu>
				309	inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
				310	int32_t result_shift, int32_t result_offset_after_shift_s32,
				311	int8_t min_s8, int8_t max_s8)
				312	{
				313	int32x4_t in_s32 = vdupq_n_s32(in_value);
				314
				315	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				316	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				317
				318	// Shift value by result_shift_s32
				319	in_value = rounding_divide_by_pow2(in_value, result_shift);
				320
				321	// Add the offset term
				322	in_value += result_offset_after_shift_s32;
				323
				324	// Bound the result
				325	int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
				326	if(is_bounded_relu)
				327	{
				328	out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
				329	}
				330
				331	return out_s8;
				332	}
				333
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	334	/** Dequantize a neon vector holding 8 quantized values.
				335	*
				336	* @param[in] qv Input values to be dequantized.
				337	* @param[in] qi Quantization information to be used in the computation.
				338	*
				339	* @return Dequantized values in a neon vector
				340	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	341	inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	342	{
				343	const float scale = qi.scale;
				344	const int offset = qi.offset;
				345	const int32x4_t voffset = vdupq_n_s32(offset);
				346	const float32x4_t vscale = vdupq_n_f32(scale);
				347	const float32x4x2_t vdequantized_input =
				348	{
				349	{
				350	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
				351	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
				352	}
				353	};
				354	return vdequantized_input;
				355	}
				356
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	357	/** Dequantize a neon vector holding 8 singed quantized values.
				358	*
				359	* @param[in] qv Input values to be dequantized.
				360	* @param[in] qi Quantization information to be used in the computation.
				361	*
				362	* @return Dequantized values in a neon vector
				363	*/
				364	inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
				365	{
				366	const float scale = qi.scale;
				367	const int offset = qi.offset;
				368	const int32x4_t voffset = vdupq_n_s32(offset);
				369	const float32x4_t vscale = vdupq_n_f32(scale);
				370	const float32x4x2_t vdequantized_input =
				371	{
				372	{
				373	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
				374	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
				375	}
				376	};
				377	return vdequantized_input;
				378	}
				379
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	380	/** Dequantize a neon vector holding 16 quantized values.
				381	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	382	* @param[in] qv Input values to be dequantized.
				383	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	384	*
				385	* @return Dequantized values in a neon vector
				386	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	387	inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	388	{
				389	const float scale = qi.scale;
				390	const int offset = qi.offset;
				391	const int32x4_t voffset = vdupq_n_s32(offset);
				392	const float32x4_t vscale = vdupq_n_f32(scale);
				393	const float32x4x4_t vdequantized_input =
				394	{
				395	{
				396	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				397	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				398	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				399	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				400	}
				401	};
				402	return vdequantized_input;
				403	}
				404
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	405	/** Dequantize a neon vector holding 16 signed quantized values.
				406	*
				407	* @param[in] qv Input values to be dequantized.
				408	* @param[in] qi Quantization information to be used in the computation.
				409	*
				410	* @return Dequantized values in a neon vector
				411	*/
				412	inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
				413	{
				414	const float scale = qi.scale;
				415	const int offset = qi.offset;
				416	const int32x4_t voffset = vdupq_n_s32(offset);
				417	const float32x4_t vscale = vdupq_n_f32(scale);
				418	const float32x4x4_t vdequantized_input =
				419	{
				420	{
				421	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				422	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				423	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				424	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				425	}
				426	};
				427	return vdequantized_input;
				428	}
				429
Georgios Pinitas	3d13af8	2019-06-04 13:04:16 +0100	[diff] [blame]	430	/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
				431	*
				432	* @param[in] qv Input values to be dequantized.
				433	* @param[in] scale Quantization scaling factor.
				434	* @param[in] offset Zero quantization offset.
				435	*
				436	* @return Dequantized values in a neon vector
				437	*/
				438	inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
				439	{
				440	const int32x4_t voffset = vdupq_n_s32(offset);
				441	const float32x4_t vscale = vdupq_n_f32(scale);
				442	const float32x4x4_t vdequantized_input =
				443	{
				444	{
				445	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				446	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				447	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				448	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				449	}
				450	};
				451	return vdequantized_input;
				452	}
				453
Sang-Hoon Park	d817647	2019-12-04 09:46:28 +0000	[diff] [blame]	454	/** Dequantize a vector of 16 values stored as signed asymmetric.
				455	*
				456	* @param[in] qv Input values to be dequantized.
				457	* @param[in] scale Quantization scaling factor.
				458	* @param[in] offset Zero quantization offset.
				459	*
				460	* @return Dequantized values in a neon vector
				461	*/
				462	inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
				463	{
				464	const int32x4_t voffset = vdupq_n_s32(offset);
				465	const float32x4_t vscale = vdupq_n_f32(scale);
				466	const float32x4x4_t vdequantized_input =
				467	{
				468	{
				469	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				470	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
				471	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				472	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
				473	}
				474	};
				475	return vdequantized_input;
				476	}
				477
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	478	/** Dequantize following symmetric quantization scheme a neon vector holding 16 quantized values.
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	479	*
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	480	* @param[in] qv Input values to be dequantized.
				481	* @param[in] vscale Vector containing quantization scaling factors.
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	482	*
				483	* @return Dequantized values in a neon vector
				484	*/
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	485	inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	486	{
				487	const float32x4x4_t vdequantized_input =
				488	{
				489	{
Georgios Pinitas	8217c8e	2019-11-11 18:24:22 +0000	[diff] [blame]	490	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
				491	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
				492	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
				493	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
Michalis Spyrou	3f632f3	2019-08-22 16:52:00 +0100	[diff] [blame]	494	}
				495	};
				496	return vdequantized_input;
				497	}
				498
Georgios Pinitas	3d13af8	2019-06-04 13:04:16 +0100	[diff] [blame]	499	/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
				500	*
				501	* @param[in] qv Input values to be dequantized.
				502	* @param[in] scale Quantization scaling factor.
				503	*
				504	* @return Dequantized values in a neon vector
				505	*/
				506	inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
				507	{
				508	const float32x4_t vscale = vdupq_n_f32(scale);
				509	const float32x4x4_t vdequantized_input =
				510	{
				511	{
				512	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				513	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				514	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				515	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				516	}
				517	};
				518	return vdequantized_input;
				519	}
				520
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	521	/** Quantize a neon vector holding 8 floating point values.
				522	*
				523	* @param[in] qv Input values to be quantized.
				524	* @param[in] qi Quantization information to be used in the computation.
				525	*
				526	* @return A neon vector holding the quantized values
				527	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	528	inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	529	{
				530	const float scale = qi.scale;
				531	const int offset = qi.offset;
				532	const float32x4_t voffset = vdupq_n_f32(offset);
				533	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				534	const int32x4x4_t rf =
				535	{
				536	{
				537	#ifdef __aarch64__
				538	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				539	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				540	#else //__aarch64__
				541	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				542	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				543	#endif //__aarch64__
				544	}
				545	};
				546	return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				547	}
				548
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	549	/** Quantize a neon vector holding 8 floating point values.
				550	*
				551	* @param[in] qv Input values to be quantized.
				552	* @param[in] qi Quantization information to be used in the computation.
				553	*
				554	* @return A neon vector holding the singed quantized values
				555	*/
				556	inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
				557	{
				558	const float scale = qi.scale;
				559	const int offset = qi.offset;
				560	const float32x4_t voffset = vdupq_n_f32(offset);
				561	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				562	const int32x4x4_t rf =
				563	{
				564	{
				565	#ifdef __aarch64__
				566	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				567	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				568	#else //__aarch64__
				569	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				570	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				571	#endif //__aarch64__
				572	}
				573	};
				574	return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				575	}
				576
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	577	/** Quantize a neon vector holding 16 floating point values.
				578	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	579	* @param[in] qv Input values to be quantized.
				580	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	581	*
				582	* @return A neon vector holding the quantized values
				583	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	584	inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	585	{
				586	const float scale = qi.scale;
				587	const int offset = qi.offset;
				588	const float32x4_t voffset = vdupq_n_f32(offset);
				589	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				590	const int32x4x4_t rf =
				591	{
				592	{
				593	#ifdef __aarch64__
				594	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				595	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				596	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				597	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				598	#else //__aarch64__
				599	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				600	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				601	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				602	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				603	#endif //__aarch64__
				604	}
				605	};
				606	const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				607	const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				608	return vcombine_u8(pa, pb);
				609	}
Michele Di Giorgio	d64a46c	2019-10-01 12:25:49 +0100	[diff] [blame]	610
Michalis Spyrou	8d4d1b8	2019-11-28 11:31:23 +0000	[diff] [blame]	611	/** Signed quantize a neon vector holding 16 floating point values.
				612	*
				613	* @param[in] qv Input values to be quantized.
				614	* @param[in] qi Quantization information to be used in the computation.
				615	*
				616	* @return A neon vector holding the quantized values
				617	*/
				618
				619	inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
				620	{
				621	const float scale = qi.scale;
				622	const int offset = qi.offset;
				623	const float32x4_t voffset = vdupq_n_f32(offset);
				624	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				625	const int32x4x4_t rf =
				626	{
				627	{
				628	#ifdef __aarch64__
				629	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				630	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				631	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				632	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				633	#else //__aarch64__
				634	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				635	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				636	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				637	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				638	#endif //__aarch64__
				639
				640	}
				641	};
				642	const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				643	const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				644	return vcombine_s8(pa, pb);
				645	}
				646
Michele Di Giorgio	d64a46c	2019-10-01 12:25:49 +0100	[diff] [blame]	647	/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
				648	*
				649	* @param[in] qv Input values to be quantized.
				650	* @param[in] qi Quantization information to be used in the computation.
				651	*
				652	* @return A neon vector holding the quantized values
				653	*/
				654	inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
				655	{
				656	const float scale = qi.scale;
				657	const int offset = qi.offset;
				658	const float32x4_t voffset = vdupq_n_f32(offset);
				659	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				660	const int32x4x4_t rf =
				661	{
				662	{
				663	#ifdef __aarch64__
				664	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				665	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				666	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				667	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				668	#else //__aarch64__
				669	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				670	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				671	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				672	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				673	#endif //__aarch64__
				674	}
				675	};
				676	const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
				677	const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
				678	return { pa, pb };
				679	}
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	680	} // namespace arm_compute
				681	#include "arm_compute/core/NEON/NEAsymm.inl"
Michalis Spyrou	f464337	2019-11-29 16:17:13 +0000	[diff] [blame]	682	#endif // ARM_COMPUTE_NEASYMM_H