Prevent RELU from being processed through LUT in INT8

- For quantized RELU activation, de-quantization and re-quantization
  are not required, since only a comparison against the quantization
  bias is needed.

Resolves: COMPMID-6340
Change-Id: I574bd220f3d0d893b7f7c4819a883e2a131f61f4
Signed-off-by: Sangwon Ha <sangwon.ha@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10916
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: <felixjohnny.thomasmathibalan@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index 50bf672..3f3d72e 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,8 @@
      [](const ActivationDataTypeISASelectorData &data)
      {
          return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
-                data.cpumodel == CPUModel::A510 && data.isa.sve2;
+                data.cpumodel == CPUModel::A510 && data.isa.sve2 &&
+                data.f != ActivationLayerInfo::ActivationFunction::RELU;
      },
      REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)},
 #endif // ARM_COMPUTE_ENABLE_SVE
@@ -57,7 +58,10 @@
     {// Neon LUT implementantion takes precedence
      "neon_q8_activation_lut",
      [](const ActivationDataTypeISASelectorData &data)
-     { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
+     {
+         return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+                data.f != ActivationLayerInfo::ActivationFunction::RELU;
+     },
      REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
 #endif // __aarch64__
     {"sve2_qu8_activation",
@@ -214,9 +218,6 @@
             case ActivationLayerInfo::ActivationFunction::LINEAR:
                 tmp_f = a * tmp_f + b;
                 break;
-            case ActivationLayerInfo::ActivationFunction::RELU:
-                tmp_f = std::max<>(0.f, tmp_f);
-                break;
             case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
                 tmp_f = std::min<>(a, std::max(0.f, tmp_f));
                 break;
@@ -278,7 +279,8 @@
     _name       = std::string("CpuActivationKernel").append("/").append(uk->name);
 
 #ifdef __aarch64__
-    if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
+    if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) &&
+        activation_info.activation() != ActivationFunction::RELU)
     {
         ActivationLayerInfo::LookupTable256 tmp_lut;
         init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(),
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
index f289c80..ddd186f 100644
--- a/src/cpu/kernels/activation/generic/neon/lut.cpp
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,9 @@
 #ifdef __aarch64__
 void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
-                         src->info()->data_type() != DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
     const auto window_end_x  = window.x().end();
     Window     win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
index 2ed667d..5db8595 100644
--- a/src/cpu/kernels/activation/generic/sve2/lut.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,9 @@
 #ifdef __aarch64__
 void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
-                         src->info()->data_type() != DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation
+        (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) ||
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU);
     const auto window_end_x  = window.x().end();
     Window     win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));