COMPMID-2012: Remove unnecessary templates from NEON kernels

Change-Id: I2deb26188c7de7c6ad10d2f51f83e729fed7e5e2
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/961
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
index 7e0fb43..076af4f 100644
--- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,24 +75,9 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Template function to run the reorg
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_reorg(const Window &window);
-
-    /** Common signature for all the specialised reorg functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ReorgFunctionPtr = void (NEReorgLayerKernel::*)(const Window &window);
-
-private:
-    ReorgFunctionPtr _func;
-    const ITensor   *_input;
-    ITensor         *_output;
-    int32_t          _stride;
+    const ITensor *_input;
+    ITensor       *_output;
+    int32_t        _stride;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEREORGLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
index 3a9e81f..42a0539 100644
--- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
@@ -84,24 +84,10 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Template function to run the stack
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <typename T>
-    void run_stack(const Window &window);
-
-    /** Common signature for all the specialised stack functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using StackFunctionPtr = void (NEStackLayerKernel::*)(const Window &window);
-
     const ITensor *_input;
     ITensor       *_output;
     unsigned int   _axis;
     unsigned int   _idx_input;
-    StackFunctionPtr _func;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_NESTACKLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index 21f36f6..bba18a8 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,12 +99,9 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
-
-    WeightsReshapeKernel *_func;
-    const ITensor        *_input;
-    const ITensor        *_bias;
-    ITensor              *_output;
+    const ITensor *_input;
+    const ITensor *_bias;
+    ITensor       *_output;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index 8baea2b..ece5aa4 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,47 @@
 }
 } // namespace
 
-template <typename T>
-void NEReorgLayerKernel::run_reorg(const Window &window)
+NEReorgLayerKernel::NEReorgLayerKernel()
+    : _input(nullptr), _output(nullptr), _stride(1)
 {
+}
+
+void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+    _input  = input;
+    _output = output;
+    _stride = stride;
+
+    // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+
+    ICPPKernel::configure(win);
+}
+
+Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+    return Status{};
+}
+
+void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
     const DataLayout data_layout = _input->info()->data_layout();
     const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -101,72 +139,8 @@
         map_coords.set(idx_c, c % out_c);
 
         // Perform mapping
-        *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
+        std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
     },
     out);
 }
-
-NEReorgLayerKernel::NEReorgLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
-{
-}
-
-void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
-
-    _func   = nullptr;
-    _input  = input;
-    _output = output;
-    _stride = stride;
-
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
-            break;
-        case 2:
-            _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
-            break;
-        case 4:
-            _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
-    // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps());
-
-    ICPPKernel::configure(win);
-}
-
-Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
-    return Status{};
-}
-
-void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
-    if(_func != nullptr)
-    {
-        (this->*_func)(window);
-    }
-}
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 62e4882..36398cf 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -189,31 +189,21 @@
     switch(_input->info()->data_type())
     {
         case DataType::F32:
-            run_reverse<float>(window, _input, _axis, _output);
+        case DataType::U32:
+        case DataType::S32:
+            run_reverse<uint32_t>(window, _input, _axis, _output);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            run_reverse<float16_t>(window, _input, _axis, _output);
-            break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::U32:
-            run_reverse<uint32_t>(window, _input, _axis, _output);
-            break;
-        case DataType::S32:
-            run_reverse<int32_t>(window, _input, _axis, _output);
-            break;
         case DataType::S16:
-            run_reverse<int16_t>(window, _input, _axis, _output);
-            break;
         case DataType::U16:
             run_reverse<uint16_t>(window, _input, _axis, _output);
             break;
         case DataType::QASYMM8:
         case DataType::U8:
-            run_reverse<uint8_t>(window, _input, _axis, _output);
-            break;
         case DataType::S8:
-            run_reverse<int8_t>(window, _input, _axis, _output);
+            run_reverse<uint8_t>(window, _input, _axis, _output);
             break;
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 0c33f36..3447d59 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -87,7 +87,7 @@
 } // namespace
 
 NEStackLayerKernel::NEStackLayerKernel()
-    : _input(nullptr), _output(nullptr), _axis(), _idx_input(), _func(nullptr)
+    : _input(nullptr), _output(nullptr), _axis(), _idx_input()
 {
 }
 
@@ -101,22 +101,6 @@
     _axis      = axis;
     _idx_input = idx_input;
 
-    switch(input->info()->element_size())
-    {
-        case 1:
-            _func = &NEStackLayerKernel::run_stack<uint8_t>;
-            break;
-        case 2:
-            _func = &NEStackLayerKernel::run_stack<uint16_t>;
-            break;
-        case 4:
-            _func = &NEStackLayerKernel::run_stack<uint32_t>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), axis, num_tensors, output->info());
 
@@ -137,15 +121,6 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    if(_func != nullptr)
-    {
-        (this->*_func)(window);
-    }
-}
-
-template <typename T>
-void NEStackLayerKernel::run_stack(const Window &window)
-{
     Window window_out;
     window_out.use_tensor_dimensions(_output->info()->tensor_shape());
 
@@ -160,9 +135,9 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        Coordinates id_out                           = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
-        const int   idx                              = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
-        *(reinterpret_cast<T *>(output.ptr() + idx)) = *(reinterpret_cast<const T *>(input.ptr()));
+        Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+        const int   idx    = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
+        std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
     },
     input);
 }
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4a0cf27..624833a 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,59 +34,6 @@
 
 namespace
 {
-template <typename T>
-void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
-{
-    const unsigned int kernel_size_x   = input->info()->dimension(0);
-    const unsigned int kernel_size_y   = input->info()->dimension(1);
-    const unsigned int kernel_depth    = input->info()->dimension(2);
-    const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
-    const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
-    const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
-    const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
-
-    // Create iterators
-    Iterator in(input, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-        const int kernel_idz = id[4];
-
-        // Setup pointers
-        const uint8_t *tmp_input_ptr        = in.ptr();
-        uint8_t       *tmp_output_ptr       = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
-        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
-        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
-        {
-            for(unsigned int j = 0; j < kernel_size_y; ++j)
-            {
-                for(unsigned int i = 0; i < kernel_size_x; ++i)
-                {
-                    *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
-                }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
-            }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
-
-        // Add bias
-        if(bias != nullptr)
-        {
-            *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
-        }
-    },
-    in);
-}
-
 TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
 {
     TensorShape output_shape{ input->tensor_shape() };
@@ -141,7 +88,7 @@
 } // namespace
 
 NEWeightsReshapeKernel::NEWeightsReshapeKernel()
-    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
 }
 
@@ -161,30 +108,6 @@
     _bias   = bias;
     _output = output;
 
-    switch(_input->info()->element_size())
-    {
-        case 4:
-        {
-            _func = &weights_reshape<uint32_t>;
-            break;
-        }
-        case 2:
-        {
-            _func = &weights_reshape<uint16_t>;
-            break;
-        }
-        case 1:
-        {
-            _func = &weights_reshape<uint8_t>;
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR_ON("Element size not supported");
-            break;
-        }
-    }
-
     // Configure kernel
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -205,5 +128,52 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    (*_func)(_input, _bias, _output, window);
+    const unsigned int kernel_size_x   = _input->info()->dimension(0);
+    const unsigned int kernel_size_y   = _input->info()->dimension(1);
+    const unsigned int kernel_depth    = _input->info()->dimension(2);
+    const unsigned int input_stride_x  = _input->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y  = _input->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z  = _input->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
+
+    // Create iterators
+    Iterator in(_input, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr        = in.ptr();
+        uint8_t       *tmp_output_ptr       = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size_y; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size_x; ++i)
+                {
+                    std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr      = curr_input_depth_ptr;
+        }
+
+        // Add bias
+        if(_bias != nullptr)
+        {
+            std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
+        }
+    },
+    in);
 }