Adding GELU activation
The OpenCL implementation uses the built-in erf.
The NEON implementation requires a new vectorized erf.

It uses the following approximation, which holds for x >= 0
(negative inputs are handled via the odd symmetry erf(-x) = -erf(x)):

erf(x) ≈ 1 - 1 / (1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4)^4

a1 = 0.278393, a2 = 0.230389, a3 = 0.000972, a4 = 0.078108
From https://en.wikipedia.org/wiki/Error_function#Numerical_approximations
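
For reference, a minimal scalar sketch of the approximation and the
GELU it serves (erf_approx and gelu_ref are illustrative names, not
part of the patch):

    #include <cmath>

    // Sketch only: the rational approximation above; its stated max
    // absolute error is about 5e-4. Valid for x >= 0, so the sign is
    // peeled off and restored via erf(-x) = -erf(x).
    static float erf_approx(float x)
    {
        const float a1 = 0.278393f, a2 = 0.230389f, a3 = 0.000972f, a4 = 0.078108f;
        const float ax = std::fabs(x);
        // Horner evaluation of 1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4
        float d = 1.0f + ax * (a1 + ax * (a2 + ax * (a3 + ax * a4)));
        d = d * d;
        d = d * d; // d^4
        const float e = 1.0f - 1.0f / d;
        return x < 0.0f ? -e : e;
    }

    // GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
    static float gelu_ref(float x)
    {
        return 0.5f * x * (1.0f + erf_approx(x * 0.70710678118f));
    }

As a quick check, gelu_ref(1.0f) comes out near 0.841, matching
x * Phi(x) with Phi the standard normal CDF.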
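
The vectorized form could look roughly like this in raw NEON
intrinsics; the patch itself routes it through the library's wrapper
layer as wrapper::verf, and the quantized GELU path is AArch64-only
(vdivq_f32 below likewise requires AArch64):

    #include <arm_neon.h>

    // Sketch only, not the patch's implementation.
    static inline float32x4_t verf_approx(float32x4_t x)
    {
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t ax  = vabsq_f32(x);

        // Horner evaluation of 1 + a1*x + a2*x^2 + a3*x^3 + a4*x^4
        float32x4_t d = vdupq_n_f32(0.078108f);        // a4
        d = vmlaq_f32(vdupq_n_f32(0.000972f), d, ax);  // a3 + a4*x
        d = vmlaq_f32(vdupq_n_f32(0.230389f), d, ax);  // a2 + ...
        d = vmlaq_f32(vdupq_n_f32(0.278393f), d, ax);  // a1 + ...
        d = vmlaq_f32(one, d, ax);                     // 1  + ...

        d = vmulq_f32(d, d);
        d = vmulq_f32(d, d);                           // d^4

        // erf(|x|) ≈ 1 - 1/d^4
        float32x4_t e = vsubq_f32(one, vdivq_f32(one, d));

        // Restore the sign: erf(-x) = -erf(x)
        return vbslq_f32(vcltq_f32(x, vdupq_n_f32(0.0f)), vnegq_f32(e), e);
    }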
Signed-off-by: Murray Kornelsen <murray.kornelsen@mail.mcgill.ca>
Change-Id: I2d3964b2c26a4334166b17135f9104bc6324fad2
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7921
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Pablo Marquez Tello <pablo.tello@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index 67d9e0a..05a0b50 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -58,9 +58,13 @@
const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
+ const auto vconst_1 = vdupq_n_f32(1.f);
+
#ifndef __aarch64__
- const auto vconst_1 = vdupq_n_f32(1.f);
const auto vconst_0_f32 = vdupq_n_f32(0);
+#else // #ifndef __aarch64__
+ const auto const_inv_2 = vdupq_n_f32(0.5f);
+ const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f);
#endif // __aarch64__
const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
@@ -193,6 +197,23 @@
tmp = vquantize(tmp_dep, qi_out);
}
+#else // #ifndef __aarch64__
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ const auto vin_deq = vdequantize(vin, qi_in);
+ // Perform activation: GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+ const float32x4x4_t tmp_dep =
+ {
+ {
+ wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))),
+ wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))),
+ }
+ };
+ // Re-quantize to new output space
+ tmp = vquantize(tmp_dep, qi_out);
+ }
#endif // __aarch64__
else
{
@@ -248,6 +269,12 @@
tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
tmp = quantize_qasymm8(tmp_f, qi_out);
}
+ else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+ {
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ tmp_f = tmp_f * 0.5f * (1.0f + std::erff(tmp_f / 1.41421356237f));
+ tmp = quantize_qasymm8(tmp_f, qi_out);
+ }
#endif // __aarch64__
else
{
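
For context, a scalar model of the per-element round-trip the
quantized kernel performs; QuantInfo, dequant and quant are simplified
stand-ins for the library's UniformQuantizationInfo and
(de)quantize_qasymm8 helpers:

    #include <cmath>
    #include <cstdint>

    struct QuantInfo { float scale; int32_t offset; };

    static float dequant(uint8_t v, QuantInfo qi)
    {
        return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
    }

    static uint8_t quant(float v, QuantInfo qi)
    {
        int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
        q = q < 0 ? 0 : (q > 255 ? 255 : q); // clamp to qasymm8 range
        return static_cast<uint8_t>(q);
    }

    static uint8_t gelu_qasymm8(uint8_t in, QuantInfo qi_in, QuantInfo qi_out)
    {
        float x = dequant(in, qi_in);                          // dequantize
        x = 0.5f * x * (1.0f + std::erf(x * 0.70710678118f));  // GELU in float
        return quant(x, qi_out);                               // requantize
    }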