COMPMID-3862: Add support QASYMM8 LEAKY RELU activation
- LEAKY RELU activation is supported for QASYMM8 data type
- vquantize on NEON side has been modified to match with
other backends (OpenCL and reference)
Change-Id: I194631225c8d4f3cc96027d64812ec2be2b4328a
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4593
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index 70d48d5..9b92a86 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEASYMM_H
#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
#include <arm_neon.h>
namespace arm_compute
@@ -647,6 +648,29 @@
return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
+inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset)
+{
+ const int32x4_t voffset = vdupq_n_s32(offset);
+ const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+ const int32x4x4_t rf =
+ {
+ {
+#ifdef __aarch64__
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+#else //__aarch64__
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+ vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+#endif //__aarch64__
+ }
+ };
+ return rf;
+}
+
/** Quantize a neon vector holding 16 floating point values.
*
* @param[in] qv Input values to be quantized.
@@ -656,26 +680,7 @@
*/
inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_u8(pa, pb);
@@ -690,26 +695,7 @@
*/
inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
return vcombine_s8(pa, pb);
@@ -724,26 +710,7 @@
*/
inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
+ auto rf = vquantize_internal(qv, qi.scale, qi.offset);
const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
return { pa, pb };