Blame - arm_compute/core/NEON/NEAsymm.h - ml/ComputeLibrary

blob: 4c8f7973606a74e153f27ee97a926824b5ef580a [file] [log] [blame]

Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	1	/*
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#ifndef __ARM_COMPUTE_NEASYMM_H__
				25	#define __ARM_COMPUTE_NEASYMM_H__
				26
				27	#include <arm_neon.h>
				28
				29	namespace arm_compute
				30	{
// Vector type aliases for 8-bit asymmetric quantized (QASYMM8) data.
// Each trailing Doxygen comment is explicitly closed; an unterminated comment here would
// swallow the aliases that follow it.
using qasymm8x8_t   = uint8x8_t;   /**< 8 bit quantized asymmetric vector with 8 elements */
using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */
using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */
using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
using qasymm8x16_t  = uint8x16_t;  /**< 8 bit quantized asymmetric vector with 16 elements */
				36
/** Divide every lane of a 4-element S32 vector by a power of two, rounding the result
 *
 * @note The expression is documented as: (x + 2^n - 1) / 2^n, where n = exponent
 * @note NOTE(review): as written, that expression is a ceiling division rather than a
 *       round-to-nearest; confirm the exact rounding rule against the definition, which is
 *       not part of this header (presumably in NEAsymm.inl, included at the end of the file).
 *
 * @param[in] x        Vector of 4 elements to divide
 * @param[in] exponent Integer n selecting the power-of-two divisor 2^n
 *
 * @return Vector holding the rounded quotient of each lane
 */
int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	47
/** Divide a single S32 value by a power of two, rounding the result
 *
 * Scalar counterpart of the int32x4_t overload of the same name.
 *
 * @note The expression is documented as: (x + 2^n - 1) / 2^n, where n = exponent
 * @note NOTE(review): as written, that expression is a ceiling division rather than a
 *       round-to-nearest; confirm the exact rounding rule against the definition, which is
 *       not part of this header (presumably in NEAsymm.inl, included at the end of the file).
 *
 * @param[in] x        Element to divide.
 * @param[in] exponent Integer n selecting the power-of-two divisor 2^n
 *
 * @return The rounded quotient
 */
int32_t rounding_divide_by_pow2(int32_t x, int exponent);
				58
/** Multiply-accumulate applied to all 16 components of a QASYMM8 vector
 *
 * Computes vd * vs + vo for every component.
 *
 * @param[in] vd Input vector value in QASYMM8 format
 * @param[in] vs Multiplier in F32 format; the same value must be present in all four lanes.
 * @param[in] vo Addend in F32 format; the same value must be present in all four lanes.
 *
 * @return A 16-component vector in QASYMM8 format, saturated to fit
 */
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	70
				71	/** Performs final quantization step on 16 elements
				72	*
				73	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				74	*
				75	* @param in_s32 Input to be quantized.
				76	* @param result_fixedpoint_multiplier Result multiplier parameter
				77	* @param result_shift Result shift parameter
				78	* @param result_offset_after_shift_s32 Result offset parameter
				79	* @param min_u8 Relu lower bound
				80	* @param max_u8 Relu upper bound
				81	*
				82	* @return Quantized values
				83	*/
				84	template <bool is_bounded_relu>
				85	uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
				86	int result_fixedpoint_multiplier,
				87	int32_t result_shift,
				88	int32x4_t result_offset_after_shift_s32,
				89	uint8x16_t min_u8,
				90	uint8x16_t max_u8)
				91	{
				92	const static int32x4_t zero_s32 = vdupq_n_s32(0);
				93
				94	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				95	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				96	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				97	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				98	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				99
				100	// Round to the nearest division by a power-of-two using result_shift_s32
				101	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				102	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				103	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				104	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				105
				106	// Add the offset terms
				107	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				108	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				109	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				110	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				111
				112	// Saturate negative values
				113	in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
				114	in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
				115	in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
				116	in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
				117
				118	// Convert S32 to S16
				119	const int16x8x2_t in_s16 =
				120	{
				121	{
				122	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				123	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				124	}
				125	};
				126
				127	// Convert S16 to U8
				128	uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
				129
				130	if(is_bounded_relu)
				131	{
				132	out_u8 = vmaxq_u8(out_u8, min_u8);
				133	out_u8 = vminq_u8(out_u8, max_u8);
				134	}
				135
				136	return out_u8;
				137	}
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	138
/** Performs the final quantization step on a single element
 *
 * Scalar counterpart of the 16-element overload: fixed-point multiply, rounding division by
 * 2^result_shift, offset addition, then a clamp to [0, 255] and, optionally, to [min_u8, max_u8].
 *
 * @tparam is_bounded_relu Specifies if a fused bounded relu should be applied
 *
 * @param[in] in_value                      Input to be quantized.
 * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
 * @param[in] result_shift                  Result shift parameter
 * @param[in] result_offset_after_shift_s32 Result offset parameter
 * @param[in] min_u8                        Relu lower bound
 * @param[in] max_u8                        Relu upper bound
 *
 * @return Quantized value
 */
template <bool is_bounded_relu>
inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
                                     uint8_t min_u8, uint8_t max_u8)
{
    // The scalar is broadcast so the vector saturating rounding doubling multiply high can be
    // applied; only lane 0 of the product is read back.
    const int32x4_t product_s32 = vqrdmulhq_n_s32(vdupq_n_s32(in_value), result_fixedpoint_multiplier);
    in_value                    = vgetq_lane_s32(product_s32, 0);

    // Rounding division by 2^result_shift followed by the offset term
    in_value = rounding_divide_by_pow2(in_value, result_shift);
    in_value = in_value + result_offset_after_shift_s32;

    // Clamp to the representable U8 range before narrowing
    const int32_t capped_s32 = std::min<int32_t>(255, in_value);
    const uint8_t quantized  = static_cast<uint8_t>(std::max<int32_t>(0, capped_s32));

    if(!is_bounded_relu)
    {
        return quantized;
    }

    // Fused bounded relu: restrict the result to [min_u8, max_u8]
    return static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, quantized)));
}
				177
/** Dequantize a neon vector holding 8 quantized values.
 *
 * Every element is converted as (q - qi.offset) * qi.scale.
 *
 * @param[in] qv Input values to be dequantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return Dequantized values in a neon vector
 */
inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
{
    const float       q_scale  = qi.scale;
    const int         q_offset = qi.offset;
    const int32x4_t   voff     = vdupq_n_s32(q_offset);
    const float32x4_t vscl     = vdupq_n_f32(q_scale);

    // Widen U8 -> U16 once, then widen each half to 32 bit and view it as signed
    const uint16x8_t wide_u16 = vmovl_u8(qv);
    const int32x4_t  lo_s32   = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(wide_u16)));
    const int32x4_t  hi_s32   = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(wide_u16)));

    // Remove the offset, convert to F32 and apply the scale
    float32x4x2_t dequantized;
    dequantized.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(lo_s32, voff)), vscl);
    dequantized.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(hi_s32, voff)), vscl);
    return dequantized;
}
				200
/** Dequantize a neon vector holding 16 quantized values.
 *
 * Every element is converted as (q - qi.offset) * qi.scale.
 *
 * @param[in] qv Input values to be dequantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return Dequantized values in a neon vector
 */
inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
{
    const float       q_scale  = qi.scale;
    const int         q_offset = qi.offset;
    const int32x4_t   voff     = vdupq_n_s32(q_offset);
    const float32x4_t vscl     = vdupq_n_f32(q_scale);

    // Widen each 8-element half U8 -> U16
    const uint16x8_t lower_u16 = vmovl_u8(vget_low_u8(qv));
    const uint16x8_t upper_u16 = vmovl_u8(vget_high_u8(qv));

    // Widen every quarter to 32 bit and view it as signed
    const int32x4_t q0_s32 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lower_u16)));
    const int32x4_t q1_s32 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lower_u16)));
    const int32x4_t q2_s32 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(upper_u16)));
    const int32x4_t q3_s32 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(upper_u16)));

    // Remove the offset, convert to F32 and apply the scale
    float32x4x4_t dequantized;
    dequantized.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(q0_s32, voff)), vscl);
    dequantized.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(q1_s32, voff)), vscl);
    dequantized.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(q2_s32, voff)), vscl);
    dequantized.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(q3_s32, voff)), vscl);
    return dequantized;
}
				225
/** Dequantize, following an asymmetric quantization scheme, a neon vector holding 16 quantized values.
 *
 * Every element is converted as (q - offset) * scale.
 *
 * @param[in] qv     Input values to be dequantized.
 * @param[in] scale  Quantization scaling factor.
 * @param[in] offset Zero quantization offset.
 *
 * @return Dequantized values in a neon vector
 */
inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
{
    const float32x4_t scale_f32  = vdupq_n_f32(scale);
    const int32x4_t   offset_s32 = vdupq_n_s32(offset);

    // Converts four U16 values: widen to 32 bit, subtract the offset, convert to F32, scale
    const auto dequantize_quarter = [&](const uint16x4_t &quarter) -> float32x4_t
    {
        const int32x4_t widened = vreinterpretq_s32_u32(vmovl_u16(quarter));
        return vmulq_f32(vcvtq_f32_s32(vsubq_s32(widened, offset_s32)), scale_f32);
    };

    const uint16x8_t lower_u16 = vmovl_u8(vget_low_u8(qv));
    const uint16x8_t upper_u16 = vmovl_u8(vget_high_u8(qv));

    float32x4x4_t dequantized;
    dequantized.val[0] = dequantize_quarter(vget_low_u16(lower_u16));
    dequantized.val[1] = dequantize_quarter(vget_high_u16(lower_u16));
    dequantized.val[2] = dequantize_quarter(vget_low_u16(upper_u16));
    dequantized.val[3] = dequantize_quarter(vget_high_u16(upper_u16));
    return dequantized;
}
				249
				250	/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
				251	*
				252	* @param[in] qv Input values to be dequantized.
				253	* @param[in] scale Quantization scaling factor.
				254	*
				255	* @return Dequantized values in a neon vector
				256	*/
				257	inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
				258	{
				259	const float32x4_t vscale = vdupq_n_f32(scale);
				260	const float32x4x4_t vdequantized_input =
				261	{
				262	{
				263	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				264	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				265	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				266	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				267	}
				268	};
				269	return vdequantized_input;
				270	}
				271
/** Quantize a neon vector holding 8 floating point values.
 *
 * Every element is converted as offset + value * (1 / scale), turned into S32 and then narrowed
 * with saturation S32 -> S16 -> U8.
 *
 * @note The float to integer conversion differs per architecture: vcvtnq_s32_f32 is used on
 *       AArch64 and vcvtq_s32_f32 elsewhere. According to the Arm intrinsics reference the former
 *       rounds to nearest (ties to even) while the latter rounds toward zero, so the two builds can
 *       produce results that differ by one.
 * @note qi.scale is used as a divisor and is not checked against zero.
 *
 * @param[in] qv Input values to be quantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return A neon vector holding the quantized values
 */
inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
{
    const float       scale     = qi.scale;
    const int         offset    = qi.offset;
    const float32x4_t voffset   = vdupq_n_f32(static_cast<float>(offset));
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
    // Exactly two S32 vectors are produced, so the two-vector aggregate is the matching type
    const int32x4x2_t rf =
    {
        {
#ifdef __aarch64__
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else  //__aarch64__
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
        }
    };
    // Saturating narrow: S32 -> S16, then S16 -> U8
    return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
				299
/** Quantize a neon vector holding 16 floating point values.
 *
 * Every element is converted as offset + value * (1 / scale), turned into S32 and then narrowed
 * with saturation S32 -> S16 -> U8.
 *
 * @param[in] qv Input values to be quantized.
 * @param[in] qi Quantization information to be used in the computation.
 *
 * @return A neon vector holding the quantized values
 */
inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    const float       q_scale      = qi.scale;
    const int         q_offset     = qi.offset;
    const float32x4_t offset_f32   = vdupq_n_f32(q_offset);
    const float32x4_t inv_scale_f32 = vdupq_n_f32(1.f / q_scale);

    // offset + value * (1 / scale), then F32 -> S32 with the architecture specific conversion
    const auto to_s32 = [&](const float32x4_t &values) -> int32x4_t
    {
#ifdef __aarch64__
        return vcvtnq_s32_f32(vmlaq_f32(offset_f32, values, inv_scale_f32));
#else  //__aarch64__
        return vcvtq_s32_f32(vmlaq_f32(offset_f32, values, inv_scale_f32));
#endif //__aarch64__
    };

    const int32x4_t q0_s32 = to_s32(qv.val[0]);
    const int32x4_t q1_s32 = to_s32(qv.val[1]);
    const int32x4_t q2_s32 = to_s32(qv.val[2]);
    const int32x4_t q3_s32 = to_s32(qv.val[3]);

    // Saturating narrow of each half: S32 -> S16, then S16 -> U8
    const uint8x8_t lower_u8 = vqmovun_s16(vcombine_s16(vqmovn_s32(q0_s32), vqmovn_s32(q1_s32)));
    const uint8x8_t upper_u8 = vqmovun_s16(vcombine_s16(vqmovn_s32(q2_s32), vqmovn_s32(q3_s32)));
    return vcombine_u8(lower_u8, upper_u8);
}
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	333	} // namespace arm_compute
				334	#include "arm_compute/core/NEON/NEAsymm.inl"
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	335	#endif // __ARM_COMPUTE_NEASYMM_H__