Blame - arm_compute/core/NEON/NESymm.h - ml/ComputeLibrary

blob: 364a317bc77e29b55274bc54b48bcc786df4b5f9 [file] [log] [blame]

Gian Marco Iodice	bc415af	2019-06-13 15:58:32 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2019 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#ifndef __ARM_COMPUTE_NESYMM_H__
				25	#define __ARM_COMPUTE_NESYMM_H__
				26
				27	#include "NEAsymm.h"
				28	#include <arm_neon.h>
				29
				30	namespace arm_compute
				31	{
				32	/** Performs final quantization step on 8 signed 16-bit elements
				33	*
				34	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				35	*
				36	* @param[in] in_s32 Input to be quantized.
				37	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				38	* @param[in] result_shift Result shift parameter
				39	* @param[in] min_s16 Relu lower bound
				40	* @param[in] max_s16 Relu upper bound
				41	*
				42	* @return Quantized values
				43	*/
				44	template <bool is_bounded_relu>
				45	int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
				46	int result_fixedpoint_multiplier,
				47	int32_t result_shift,
				48	int16x8_t min_s16,
				49	int16x8_t max_s16)
				50	{
				51	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				52	in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
				53	in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
				54
				55	// Round to the nearest division by a power-of-two using result_shift_s32
				56	in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
				57	in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
				58
				59	// Convert S32 to S16
				60	int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
				61
				62	if(is_bounded_relu)
				63	{
				64	out_s16 = vmaxq_s16(out_s16, min_s16);
				65	out_s16 = vminq_s16(out_s16, max_s16);
				66	}
				67
				68	return out_s16;
				69	}
				70
				71	/** Performs final quantization step on single signed 16-bit element
				72	*
				73	* @tparam is_bounded_relu Specified if a fused bounded relu should be applied
				74	*
				75	* @param[in] in_value Input to be quantized.
				76	* @param[in] result_fixedpoint_multiplier Result multiplier parameter
				77	* @param[in] result_shift Result shift parameter
				78	* @param[in] min_s16 Relu lower bound
				79	* @param[in] max_s16 Relu upper bound
				80	*
				81	* @return Quantized values
				82	*/
				83	template <bool is_bounded_relu>
				84	inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
				85	int32_t result_shift, int16_t min_s16, int16_t max_s16)
				86	{
				87	int32x4_t in_s32 = vdupq_n_s32(in_value);
				88
				89	// Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
				90	in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
				91
				92	// Shift value by result_shift_s32
				93	in_value = rounding_divide_by_pow2(in_value, result_shift);
				94
				95	// Bound the result
				96	int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
				97
				98	if(is_bounded_relu)
				99	{
				100	out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
				101	}
				102
				103	return out_s16;
				104	}
giuros01	c9573f3	2019-06-20 10:30:17 +0100	[diff] [blame]	105
				106	/** Dequantize a neon vector holding 8 16-bit quantized values.
				107	*
				108	* @param[in] qv Input values to be dequantized.
				109	* @param[in] scale Quantization scale
				110	*
				111	* @return Dequantized values in a neon vector
				112	*/
				113	inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
				114	{
				115	const float32x4_t vscale = vdupq_n_f32(scale);
				116	const float32x4x2_t vdequantized_input =
				117	{
				118	{
				119	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
				120	vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
				121	}
				122	};
				123	return vdequantized_input;
				124	}
				125
				126	/** Quantize a neon vector holding 8 floating point values.
				127	*
				128	* @param[in] qv Input values to be quantized.
				129	* @param[in] scale Quantization scale
				130	*
				131	* @return A neon vector holding the quantized values
				132	*/
				133	inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
				134	{
				135	const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
				136
				137	const int32x4x2_t rf =
				138	{
				139	{
				140	#ifdef __aarch64__
				141	vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
				142	vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
				143	#else //__aarch64__
				144	vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
				145	vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
				146	#endif //__aarch64__
				147	}
				148	};
				149	return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
				150	}
				151
Gian Marco Iodice	bc415af	2019-06-13 15:58:32 +0100	[diff] [blame]	152	} // namespace arm_compute
				153	#endif // __ARM_COMPUTE_NESYMM_H__