Blame - arm_compute/core/NEON/NEAsymm.h - ml/ComputeLibrary

blob: 981c7b075c6ec8fde3d6a216b53793c85cf69a9a [file] [log] [blame]

Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	1	/*
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	2	* Copyright (c) 2017-2019 ARM Limited.
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#ifndef __ARM_COMPUTE_NEASYMM_H__
				25	#define __ARM_COMPUTE_NEASYMM_H__
				26
Manuel Bottini	7bb56c6	2019-06-26 15:17:09 +0100	[diff] [blame^]	27	#include "arm_compute/core/NEON/NEMath.h"
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	28	#include <arm_neon.h>
				29
				30	namespace arm_compute
				31	{
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	32	using qasymm8x8_t = uint8x8_t; /*< 8 bit quantized asymmetric vector with 8 elements /
				33	using qasymm8x8x2_t = uint8x8x2_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				34	using qasymm8x8x3_t = uint8x8x3_t; /*< 8 bit quantized asymmetric vector with 24 elements /
				35	using qasymm8x8x4_t = uint8x8x4_t; /*< 8 bit quantized asymmetric vector with 32 elements /
				36	using qasymm8x16_t = uint8x16_t; /*< 8 bit quantized asymmetric vector with 16 elements /
				37
/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
 *
 * Computes vd*vs + vo for every component.
 *
 * Only the declaration lives here. NOTE(review): the definition is presumably
 * provided by NEAsymm.inl, which this header includes at its end - confirm.
 *
 * @param[in] vd Input vector value in QASYMM8 format
 * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
 * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
 *
 * @return A 16-component vector in QASYMM8 format, saturated to fit
 */
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
Georgios Pinitas	f72f936	2018-01-12 16:29:45 +0000	[diff] [blame]	49
				50	/** Performs final quantization step on 16 elements
				51	*
				52	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				53	*
				54	* @param in_s32 Input to be quantized.
				55	* @param result_fixedpoint_multiplier Result multiplier parameter
				56	* @param result_shift Result shift parameter
				57	* @param result_offset_after_shift_s32 Result offset parameter
				58	* @param min_u8 Relu lower bound
				59	* @param max_u8 Relu upper bound
				60	*
				61	* @return Quantized values
				62	*/
				63	template <bool is_bounded_relu>
				64	uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
				65	int result_fixedpoint_multiplier,
				66	int32_t result_shift,
				67	int32x4_t result_offset_after_shift_s32,
				68	uint8x16_t min_u8,
				69	uint8x16_t max_u8)
				70	{
				71	const static int32x4_t zero_s32 = vdupq_n_s32(0);
				72
				73	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				74	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				75	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				76	in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
				77	in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
				78
				79	// Round to the nearest division by a power-of-two using result_shift_s32
				80	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				81	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				82	in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
				83	in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
				84
				85	// Add the offset terms
				86	in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
				87	in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
				88	in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
				89	in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
				90
				91	// Saturate negative values
				92	in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
				93	in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
				94	in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
				95	in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
				96
				97	// Convert S32 to S16
				98	const int16x8x2_t in_s16 =
				99	{
				100	{
				101	vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
				102	vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
				103	}
				104	};
				105
				106	// Convert S16 to U8
				107	uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
				108
				109	if(is_bounded_relu)
				110	{
				111	out_u8 = vmaxq_u8(out_u8, min_u8);
				112	out_u8 = vminq_u8(out_u8, max_u8);
				113	}
				114
				115	return out_u8;
				116	}
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	117
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	118	/** Performs final quantization step on single element
				119	*
				120	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				121	*
				122	* @param[in] in_value Input to be quantized.
				123	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				124	* @param[in] result_shift Result shift parameter
				125	* @param[in] result_offset_after_shift_s32 Result offset parameter
				126	* @param[in] min_u8 Relu lower bound
				127	* @param[in] max_u8 Relu upper bound
				128	*
				129	* @return Quantized value
				130	*/
				131	template <bool is_bounded_relu>
				132	inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
				133	int32_t result_shift, int32_t result_offset_after_shift_s32,
				134	uint8_t min_u8, uint8_t max_u8)
				135	{
				136	int32x4_t in_s32 = vdupq_n_s32(in_value);
				137
				138	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				139	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				140
				141	// Shift value by result_shift_s32
				142	in_value = rounding_divide_by_pow2(in_value, result_shift);
				143
				144	// Add the offset term
				145	in_value += result_offset_after_shift_s32;
				146
				147	// Bound the result
Georgios Pinitas	6fa2638	2019-03-18 10:05:34 +0000	[diff] [blame]	148	uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
George Wort	2d7e683	2019-02-22 16:37:41 +0000	[diff] [blame]	149	if(is_bounded_relu)
				150	{
				151	out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
				152	}
				153
				154	return out_u8;
				155	}
				156
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	157	/** Dequantize a neon vector holding 8 quantized values.
				158	*
				159	* @param[in] qv Input values to be dequantized.
				160	* @param[in] qi Quantization information to be used in the computation.
				161	*
				162	* @return Dequantized values in a neon vector
				163	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	164	inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	165	{
				166	const float scale = qi.scale;
				167	const int offset = qi.offset;
				168	const int32x4_t voffset = vdupq_n_s32(offset);
				169	const float32x4_t vscale = vdupq_n_f32(scale);
				170	const float32x4x2_t vdequantized_input =
				171	{
				172	{
				173	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
				174	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
				175	}
				176	};
				177	return vdequantized_input;
				178	}
				179
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	180	/** Dequantize a neon vector holding 16 quantized values.
				181	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	182	* @param[in] qv Input values to be dequantized.
				183	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	184	*
				185	* @return Dequantized values in a neon vector
				186	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	187	inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	188	{
				189	const float scale = qi.scale;
				190	const int offset = qi.offset;
				191	const int32x4_t voffset = vdupq_n_s32(offset);
				192	const float32x4_t vscale = vdupq_n_f32(scale);
				193	const float32x4x4_t vdequantized_input =
				194	{
				195	{
				196	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				197	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				198	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				199	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				200	}
				201	};
				202	return vdequantized_input;
				203	}
				204
Georgios Pinitas	3d13af8	2019-06-04 13:04:16 +0100	[diff] [blame]	205	/** Dequantize following an asymmetric quantization scheme a neon vector holding 16 quantized values.
				206	*
				207	* @param[in] qv Input values to be dequantized.
				208	* @param[in] scale Quantization scaling factor.
				209	* @param[in] offset Zero quantization offset.
				210	*
				211	* @return Dequantized values in a neon vector
				212	*/
				213	inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
				214	{
				215	const int32x4_t voffset = vdupq_n_s32(offset);
				216	const float32x4_t vscale = vdupq_n_f32(scale);
				217	const float32x4x4_t vdequantized_input =
				218	{
				219	{
				220	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				221	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
				222	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				223	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
				224	}
				225	};
				226	return vdequantized_input;
				227	}
				228
				229	/** Dequantize following a symmetric quantization scheme a neon vector holding 16 quantized values.
				230	*
				231	* @param[in] qv Input values to be dequantized.
				232	* @param[in] scale Quantization scaling factor.
				233	*
				234	* @return Dequantized values in a neon vector
				235	*/
				236	inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
				237	{
				238	const float32x4_t vscale = vdupq_n_f32(scale);
				239	const float32x4x4_t vdequantized_input =
				240	{
				241	{
				242	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				243	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
				244	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				245	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
				246	}
				247	};
				248	return vdequantized_input;
				249	}
				250
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	251	/** Quantize a neon vector holding 8 floating point values.
				252	*
				253	* @param[in] qv Input values to be quantized.
				254	* @param[in] qi Quantization information to be used in the computation.
				255	*
				256	* @return A neon vector holding the quantized values
				257	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	258	inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	259	{
				260	const float scale = qi.scale;
				261	const int offset = qi.offset;
				262	const float32x4_t voffset = vdupq_n_f32(offset);
				263	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				264	const int32x4x4_t rf =
				265	{
				266	{
				267	#ifdef __aarch64__
				268	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				269	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				270	#else //__aarch64__
				271	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				272	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				273	#endif //__aarch64__
				274	}
				275	};
				276	return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				277	}
				278
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	279	/** Quantize a neon vector holding 16 floating point values.
				280	*
Georgios Pinitas	d66094e	2019-04-15 15:44:17 +0100	[diff] [blame]	281	* @param[in] qv Input values to be quantized.
				282	* @param[in] qi Quantization information to be used in the computation.
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	283	*
				284	* @return A neon vector holding the quantized values
				285	*/
Georgios Pinitas	4c5469b	2019-05-21 13:32:43 +0100	[diff] [blame]	286	inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
Pablo Tello	54e98d9	2019-02-05 16:16:19 +0000	[diff] [blame]	287	{
				288	const float scale = qi.scale;
				289	const int offset = qi.offset;
				290	const float32x4_t voffset = vdupq_n_f32(offset);
				291	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				292	const int32x4x4_t rf =
				293	{
				294	{
				295	#ifdef __aarch64__
				296	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				297	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				298	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				299	vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				300	#else //__aarch64__
				301	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
				302	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
				303	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
				304	vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
				305	#endif //__aarch64__
				306	}
				307	};
				308	const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				309	const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				310	return vcombine_u8(pa, pb);
				311	}
Gian Marco	58c5794	2017-11-28 09:10:03 +0000	[diff] [blame]	312	} // namespace arm_compute
				313	#include "arm_compute/core/NEON/NEAsymm.inl"
Michel Iwaniec	5dfeae6	2017-11-29 10:48:23 +0000	[diff] [blame]	314	#endif // __ARM_COMPUTE_NEASYMM_H__