Use lookup table for FP16 Tanh activation on hardware with SVE

Resolves: COMPMID-6901

Change-Id: Idcd3f5f5d90f4073aaf116c0586e46013fbd64f7
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11605
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 9390d0c..83b12d5 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -121,6 +121,20 @@
         _lut_fp16 = lut;
     }
 #endif // __aarch64__
+
+    // operator< and operator== allow ActivationLayerInfo to serve as an attribute of LUTInfo (used as an ordered-map key)
+    friend bool operator<(const ActivationLayerInfo &l, const ActivationLayerInfo &r)
+    {
+        const auto l_tup = std::make_tuple(l._act, l._a, l._b, l._enabled);
+        const auto r_tup = std::make_tuple(r._act, r._a, r._b, r._enabled);
+
+        return l_tup < r_tup;
+    }
+    bool operator==(const ActivationLayerInfo &l) const
+    {
+        return this->_act == l._act && this->_a == l._a && this->_b == l._b && this->_enabled == l._enabled;
+    }
+
 private:
     ActivationFunction _act     = {ActivationLayerInfo::ActivationFunction::IDENTITY};
     float              _a       = {};
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index a5f61d6..d9c2c84 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -41,6 +41,10 @@
 
 @section S2_2_changelog Changelog
 
+v24.08 Public major release
+ - Optimize CPU activation functions using LUT-based implementation:
+   - Tanh function for FP16.
+
 v24.05 Public major release
  - Add @ref CLScatter operator for FP32/16, S32/16/8, U32/16/8 data types
  - Various fixes to enable FP16 kernels in armv8a multi_isa builds.
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
index 06e35ee..2effffb 100644
--- a/src/core/helpers/LUTManager.cpp
+++ b/src/core/helpers/LUTManager.cpp
@@ -30,17 +30,38 @@
 namespace
 {
 
-void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut)
+float16_t activation(float16_t x, const LUTInfo &info)
+{
+    float16_t out = 0.f;
+    switch (info.act)
+    {
+        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+            out = 1.f / (1.f + std::exp(-x));
+            break;
+        case ActivationLayerInfo::ActivationFunction::TANH:
+        {
+            out = static_cast<float16_t>(info.alpha * std::tanh(info.beta * x));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table");
+            break;
+    }
+    return out;
+}
+
+void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info)
 {
     union Element
     {
         uint16_t  i = 0;
         float16_t fp;
     } item;
+
     // Fill lut by iterating over all 16 bit values using the union.
     while (true)
     {
-        (*lut)[item.i] = 1.f / (1.f + std::exp(-item.fp));
+        (*lut)[item.i] = activation(item.fp, info);
         if (item.i == 65535)
             break;
         item.i++;
@@ -62,7 +83,7 @@
         // Not found, or pointer not valid
         // We do not use make_shared to prevent the weak_ptr keeping the control block alive
         std::shared_ptr<ActivationLayerInfo::LookupTable65536> ptr(new ActivationLayerInfo::LookupTable65536);
-        init_lut_fp16(ptr.get());
+        init_lut_fp16(ptr.get(), info);
         map_fp16[info] = ptr;
         return ptr;
     }
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
index 4e13ead..f3f4bf2 100644
--- a/src/core/helpers/LUTManager.h
+++ b/src/core/helpers/LUTManager.h
@@ -38,19 +38,23 @@
 struct LUTInfo
 {
     ActivationLayerInfo::ActivationFunction act;
+    float                                   alpha;
+    float                                   beta;
     DataType                                dt;
-    QuantizationInfo                        qinfo;
+    UniformQuantizationInfo                 qinfo;
+
     // Operators enable use of map with Lutinfo as key
     friend bool operator<(const LUTInfo &l, const LUTInfo &r)
     {
-        return (l.act < r.act) || ((l.act == r.act) && (l.dt < r.dt)) ||
-               ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() < r.qinfo.scale())) ||
-               ((l.act == r.act) && (l.dt == r.dt) && (l.qinfo.scale() == r.qinfo.scale()) &&
-                (l.qinfo.offset() < l.qinfo.offset()));
+        const auto l_tup = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset);
+        const auto r_tup = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset);
+
+        return l_tup < r_tup;
     }
-    bool operator==(const LUTInfo &l)
+    bool operator==(const LUTInfo &l) const
     {
-        return this->act == l.act && this->dt == l.dt && this->qinfo == l.qinfo;
+        return this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && this->dt == l.dt &&
+               this->qinfo == l.qinfo;
     }
 };
 
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 7cfa39b..4253027 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -43,6 +43,13 @@
 {
 namespace
 {
+
+bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func)
+{
+    return func == ActivationLayerInfo::ActivationFunction::LOGISTIC ||
+           func == ActivationLayerInfo::ActivationFunction::TANH;
+}
+
 static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
 #ifdef ARM_COMPUTE_ENABLE_SVE
     {"sve2_q8_activation_lut",
@@ -85,10 +92,7 @@
      REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
     {"sve_fp16_activation_lut",
      [](const ActivationDataTypeISASelectorData &data)
-     {
-         return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve &&
-                data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC;
-     },
+     { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); },
      REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)},
     {"sve_fp16_activation",
      [](const ActivationDataTypeISASelectorData &data)
@@ -299,10 +303,10 @@
         activation_info.setLookupTable256(tmp_lut);
     }
 
-    if (src->data_type() == DataType::F16 &&
-        activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+    if (std::string(uk->name) == "sve_fp16_activation_lut")
     {
-        const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()};
+        const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(),
+                              src->quantization_info().uniform()};
         activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
     }
 #endif // __aarch64__