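Reduce binary size footprint of CpuGemmInterleave4x4Kernel

Remove the gemm_interleave4x4 function template and its three
instantiations (uint8_t, uint16_t, uint32_t) together with the _func
dispatch. The interleave loop now lives directly in the kernel and
copies each element with memcpy (zero-filling partial rows with
memset), using the element size queried from the source tensor at
run time.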

Change-Id: I7f3790d8ca592ec46ff2e2de810cf402191a990e
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5881
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
index 67f2a49..a6b080c 100644
--- a/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
+++ b/src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
@@ -40,66 +40,6 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-namespace
-{
-template <typename ScalarType>
-void gemm_interleave4x4(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const size_t window_start_x = window.x().start();
-    const size_t window_end_x   = window.x().end();
-
-    const size_t in_height = src->info()->dimension(1);
-    const size_t in_stride = src->info()->strides_in_bytes()[1];
-
-    const size_t partial_y = in_height % 4;
-
-    // Set window for the src tensor
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Set window for the dst tensor
-    Window win_out(window);
-    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-    win_out.scale(Window::DimY, 0.25f);
-
-    Iterator in(src, win);
-    Iterator out(dst, win_out);
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        if(id.y() + 4 <= static_cast<int>(in_height))
-        {
-            for(size_t x = window_start_x; x < window_end_x; ++x)
-            {
-                const ScalarType data[4] =
-                {
-                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x),
-                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x),
-                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x),
-                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x),
-                };
-                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
-            }
-        }
-        else
-        {
-            for(size_t x = window_start_x; x < window_end_x; ++x)
-            {
-                ScalarType data[4] = { 0, 0, 0, 0 };
-
-                for(size_t y = 0; y < partial_y; ++y)
-                {
-                    data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x);
-                }
-
-                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
-            }
-        }
-    },
-    in, out);
-}
-} // namespace
-
 void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -110,22 +50,6 @@
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst));
 
-    switch(src->element_size())
-    {
-        case 1:
-            _func = &gemm_interleave4x4<uint8_t>;
-            break;
-        case 2:
-            _func = &gemm_interleave4x4<uint16_t>;
-            break;
-        case 4:
-            _func = &gemm_interleave4x4<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR_ON("Element size not supported");
-            break;
-    }
-
     Window win = calculate_max_window(*src, Steps(1, 4));
     ICPPKernel::configure(win);
 }
@@ -152,7 +76,6 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
     ARM_COMPUTE_ERROR_ON(tensors.empty());
     /*
     *  This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
@@ -166,7 +89,57 @@
     const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
     ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    (*_func)(src, dst, window);
+    const size_t window_start_x = window.x().start();
+    const size_t window_end_x   = window.x().end();
+
+    const size_t in_height = src->info()->dimension(1);
+    const size_t in_stride = src->info()->strides_in_bytes()[1];
+
+    const size_t partial_y = in_height % 4;
+
+    const size_t element_size = src->info()->element_size();
+
+    // Set window for the src tensor
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Set window for the dst tensor
+    Window win_out(window);
+    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win_out.scale(Window::DimY, 0.25f);
+
+    Iterator in(src, win);
+    Iterator out(dst, win_out);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        if(id.y() + 4 <= static_cast<int>(in_height))
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size);
+                std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size);
+                std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size);
+                std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size);
+            }
+        }
+        else
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                size_t y = 0;
+                for(; y < partial_y; ++y)
+                {
+                    std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size);
+                }
+                for(; y < 4; ++y)
+                {
+                    std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+                }
+            }
+        }
+    },
+    in, out);
 }
 
 const char *CpuGemmInterleave4x4Kernel::name() const