Integrate improved CPU depthwise convolution kernels

* Replace assembly kernels for depthwise convolution with more optimized
  ones.
* Add int8 assembly kernels.
* Fix implicit padding on optimized kernels.

Resolves: COMPMID-3867, COMPMID-4361

Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index ea3742f..1101e05 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -27,6 +27,7 @@
 #include "src/core/CPP/Validate.h"
 #include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
 #include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/core/utils/AssemblyUtils.h"
 
 #include <arm_neon.h>
 #include <cstdlib>
@@ -89,38 +90,6 @@
     return p;
 }
 
-arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
-{
-    arm_gemm::Activation gemm_act;
-
-    // Early exit in case lower bound is other than 0, as it's not yet supported
-    if(act.b() != 0.f)
-    {
-        return gemm_act;
-    }
-
-    switch(act.activation())
-    {
-        case ActivationLayerInfo::ActivationFunction::RELU:
-            gemm_act.type = arm_gemm::Activation::Type::ReLU;
-            break;
-        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
-            gemm_act.param1 = act.a();
-            gemm_act.param2 = 0.f;
-            break;
-        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
-            gemm_act.param1 = act.a();
-            gemm_act.param2 = act.b();
-            break;
-        default:
-            gemm_act.type = arm_gemm::Activation::Type::None;
-    }
-
-    return gemm_act;
-}
-
 IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
 {
     // Schedule assembly kernel
@@ -788,14 +757,14 @@
 
 bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
 {
-    arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
+    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
     return act.type != arm_gemm::Activation::Type::None;
 }
 
 void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
+    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
     if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))