Rename NEGEMMAssembly to CpuGemmAssembly

- NEGEMMAssemblyDispatch and NEGEMMAssemblyWrapperKernel have been renamed and moved under the cpu namespace (see the usage sketch below)
- Header files for the assembly kernels have been moved to src/core/cpu/kernels/assembly/
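
Callers that previously included src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
now include the internal cpu operator header and qualify the class with the cpu
namespace. A minimal sketch of the new call pattern, based only on the usage visible
in this patch (the helper name and tensor wiring are illustrative, not part of the
change):

    #include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Types.h"

    #include <memory>

    using namespace arm_compute;

    // Illustrative helper; mirrors how NEGEMM drives the renamed operator.
    void run_assembly_gemm(const ITensor *a, const ITensor *b, ITensor *d, const ActivationLayerInfo &act)
    {
        // Assembly metadata now lives in the arm_compute::cpu namespace.
        cpu::AsmGemmInfo asm_info;
        asm_info.method          = cpu::AsmConvMethod::Im2Col;
        asm_info.activation_info = act;

        // validate() is still a static member; only the class name and namespace changed.
        if(bool(cpu::CpuGemmAssemblyDispatch::validate(a->info(), b->info(), nullptr, d->info(), asm_info)))
        {
            auto asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
            asm_glue->configure(a, b, nullptr, d, asm_info);
            if(asm_glue->is_configured())
            {
                asm_glue->prepare();
                asm_glue->run();
            }
        }
    }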

Partially Resolves: COMPMID-4506

Change-Id: I6c2f391bb95ba1ce7ca195d0efa57b9c3225570f
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5637
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 6ece3f8..f88ddc4 100644
--- a/Android.bp
+++ b/Android.bp
@@ -568,7 +568,6 @@
         "src/runtime/NEON/functions/NEFullyConnectedLayer.cpp",
         "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp",
         "src/runtime/NEON/functions/NEGEMM.cpp",
-        "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp",
         "src/runtime/NEON/functions/NEGEMMConv2d.cpp",
         "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
         "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
@@ -650,6 +649,7 @@
         "src/runtime/cpu/operators/CpuSoftmax.cpp",
         "src/runtime/cpu/operators/CpuSub.cpp",
         "src/runtime/cpu/operators/CpuTranspose.cpp",
+        "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
         "src/runtime/gpu/cl/operators/ClActivation.cpp",
         "src/runtime/gpu/cl/operators/ClAdd.cpp",
         "src/runtime/gpu/cl/operators/ClConcatenate.cpp",
diff --git a/SConscript b/SConscript
index d5b2221..83c5a7d 100644
--- a/SConscript
+++ b/SConscript
@@ -260,7 +260,8 @@
                                       "src/core/NEON/kernels/convolution/winograd/",
                                       "src/core/NEON/kernels/convolution/depthwise/",
                                       "src/core/NEON/kernels/assembly/",
-                                      "arm_compute/core/NEON/kernels/assembly/"])
+                                      "arm_compute/core/NEON/kernels/assembly/",
+                                      "src/core/cpu/kernels/assembly/",])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
@@ -367,7 +368,8 @@
                           'src/runtime/cpu/operators/CpuSub.cpp',
                           'src/runtime/cpu/operators/CpuTranspose.cpp',
                          ]
-    runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files ]
+    cpu_internal_operator_files = ['src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp',]
+    runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files, cpu_internal_operator_files ]
 
 bootcode_o = []
 if env['os'] == 'bare_metal':
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index d4a9f68..9df2e08 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -41,12 +41,15 @@
 class NEGEMMMatrixAdditionKernel;
 class NEGEMMMatrixMultiplyKernel;
 class NEGEMMTranspose1xWKernel;
-class NEGEMMAssemblyDispatch;
+namespace cpu
+{
+class CpuGemmAssemblyDispatch;
+}
 
 /** Basic function to execute GEMM. This function calls the following kernels:
  *
  * If optimized assembly is available:
- *  -# @ref NEGEMMAssemblyDispatch
+ *  -# @ref cpu::CpuGemmAssemblyDispatch
  *  -# @ref NEActivationLayer (if alpha != 1.0)
  * Else:
  *  -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
@@ -119,16 +122,16 @@
     void prepare() override;
 
 private:
-    MemoryGroup                                 _memory_group;
-    IWeightsManager                            *_weights_manager;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>  _interleave_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>   _transpose_kernel;
-    std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel;
-    std::unique_ptr<NEGEMMAssemblyDispatch>     _asm_glue;
-    std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel;
-    NEActivationLayer                           _alpha_scale_func;
-    NEArithmeticAddition                        _add_bias;
-    NEActivationLayer                           _activation_func;
+    MemoryGroup                                   _memory_group;
+    IWeightsManager                              *_weights_manager;
+    std::unique_ptr<NEGEMMInterleave4x4Kernel>    _interleave_kernel;
+    std::unique_ptr<NEGEMMTranspose1xWKernel>     _transpose_kernel;
+    std::unique_ptr<NEGEMMMatrixMultiplyKernel>   _mm_kernel;
+    std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue;
+    std::unique_ptr<NEGEMMMatrixAdditionKernel>   _ma_kernel;
+    NEActivationLayer                             _alpha_scale_func;
+    NEArithmeticAddition                          _add_bias;
+    NEActivationLayer                             _activation_func;
 
     Tensor         _tmp_a;
     Tensor         _tmp_b;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index b2ffd03..6c71f0e 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -36,13 +36,16 @@
 {
 // Forward declarations
 class ITensor;
-class NEGEMMAssemblyDispatch;
+namespace cpu
+{
+class CpuGemmAssemblyDispatch;
+}
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
  *
  * Supports only NHWC data layout
  *
- * -# @ref NEGEMMAssemblyDispatch
+ * -# @ref cpu::CpuGemmAssemblyDispatch
  * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch
  *
  * Weights are transformed from OHWI to HWIO format using the following kernels:
@@ -111,13 +114,13 @@
     void prepare() override;
 
 private:
-    std::unique_ptr<NEGEMMAssemblyDispatch> _gemm_asm_func;
-    NEActivationLayer                       _activation_func;
-    NEPermute                               _weights_permute_func;
-    const ITensor                          *_original_weights;
-    Tensor                                  _permuted_weights;
-    bool                                    _is_prepared;
-    bool                                    _run_activation;
+    std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _gemm_asm_func;
+    NEActivationLayer                             _activation_func;
+    NEPermute                                     _weights_permute_func;
+    const ITensor                                *_original_weights;
+    Tensor                                        _permuted_weights;
+    bool                                          _is_prepared;
+    bool                                          _run_activation;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 780723e..a292712 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -44,7 +44,10 @@
 class NEGEMMLowpMatrixAReductionKernel;
 class NEGEMMLowpMatrixBReductionKernel;
 class NEGEMMTranspose1xWKernel;
-class NEGEMMAssemblyDispatch;
+namespace cpu
+{
+class CpuGemmAssemblyDispatch;
+}
 
 /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
  *
@@ -135,7 +138,7 @@
 private:
     MemoryGroup                                                    _memory_group;
     IWeightsManager                                               *_weights_manager;
-    std::unique_ptr<NEGEMMAssemblyDispatch>                        _asm_glue;
+    std::unique_ptr<cpu::CpuGemmAssemblyDispatch>                  _asm_glue;
     std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel>                _mm_kernel;
     std::unique_ptr<NEGEMMInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
     std::unique_ptr<NEGEMMTranspose1xWKernel>                      _mtx_b_reshape_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 77f9093..f9ebf60 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -47,7 +47,7 @@
  * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
  * -# @ref NEWinogradLayerTransformInputKernel
  * -# @ref NEWinogradLayerTransformOutputKernel
- * -# @ref NEGEMMAssemblyDispatch
+ * -# @ref cpu::CpuGemmAssemblyDispatch
  * -# @ref CPPPermute (three times: weights, input and output)
  *
  * @note  Some Winograd configurations (i.e. F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true
diff --git a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
similarity index 80%
rename from src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
rename to src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 7fcf2b1..4b7b092 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,8 @@
 
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_gemm_compute_iface.hpp"
 #include "src/core/NEON/INEKernel.h"
+#include "src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
 
 #include "gemm_common.hpp"
 
@@ -35,11 +35,15 @@
 {
 class ITensor;
 
+namespace cpu
+{
+namespace kernel
+{
 /** This class is a wrapper for the assembly kernels.
   *
   * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
   * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance
-  * of NEGEMMAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
+  * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
   * in the context of an NEFunctions.
   *
   * The type T is the type of the actual kernel implemented in assembly which is of type
@@ -48,19 +52,19 @@
   *
   */
 template <typename TypeInput, typename TypeOutput>
-class NEGEMMAssemblyWrapperKernel final : public INEKernel
+class CpuGemmAssemblyWrapperKernel final : public INEKernel
 {
 public:
     /** Constructor
      */
-    NEGEMMAssemblyWrapperKernel()
-        : _kernel(nullptr), _name("NEGEMMAssemblyWrapperKernel")
+    CpuGemmAssemblyWrapperKernel()
+        : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
     {
     }
 
-    NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &)  = delete;
-    NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &&) = default;
-    NEGEMMAssemblyWrapperKernel &operator=(NEGEMMAssemblyWrapperKernel &) = delete;
+    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &)  = delete;
+    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+    CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
 
     const char *name() const override
     {
@@ -94,8 +98,8 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in] kernel      Pointer to an assembly kernel implementation.
-     * @param[in] num_threads Number of concurrent threads which will execute the kernel.
+     * @param[in] kernel          Pointer to an assembly kernel implementation.
+     * @param[in] kernel_name_tag Tag to be attached to the kernel's name.
      */
     void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
     {
@@ -116,5 +120,7 @@
     arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
     std::string _name;
 };
+} // namespace kernel
+} // namespace cpu
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
similarity index 99%
rename from src/core/NEON/kernels/assembly/arm_gemm.hpp
rename to src/core/cpu/kernels/assembly/arm_gemm.hpp
index 3088b08..624e9e9 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
similarity index 89%
rename from src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
rename to src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
index d620477..718fcd1 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,8 +23,8 @@
  */
 #pragma once
 
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Window.h"
 
 #include "ndrange.hpp"
 
@@ -35,14 +35,14 @@
  * so maintain their own types which represent similar information.
  */
 
-namespace arm_gemm {
-
+namespace arm_gemm
+{
 //we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library
 constexpr std::size_t ndrange_max =
     arm_compute::Dimensions<unsigned int>::num_max_dimensions;
 
-using ndrange_t=NDRange<ndrange_max>;
-using ndcoord_t=NDCoordinate<ndrange_max>;
+using ndrange_t = NDRange<ndrange_max>;
+using ndcoord_t = NDCoordinate<ndrange_max>;
 
 /* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window`
  *
@@ -52,10 +52,12 @@
  * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window`
  * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
  */
-inline arm_compute::Window to_window(const ndrange_t& ndr) {
+inline arm_compute::Window to_window(const ndrange_t &ndr)
+{
     arm_compute::Window win;
 
-    for(unsigned int i = 0; i!=ndrange_max; ++i) {
+    for(unsigned int i = 0; i != ndrange_max; ++i)
+    {
         //populate the window with the dimensions of the NDRange
         win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
     }
@@ -69,10 +71,12 @@
  * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window`
  * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
  */
-inline arm_compute::Window to_window(const ndcoord_t& ndc) {
+inline arm_compute::Window to_window(const ndcoord_t &ndc)
+{
     arm_compute::Window win;
 
-    for(unsigned int i = 0; i!=ndrange_max; ++i) {
+    for(unsigned int i = 0; i != ndrange_max; ++i)
+    {
         const auto start = ndc.get_position(i);
         const auto size  = ndc.get_size(i);
         const auto stop  = start + size;
@@ -92,8 +96,10 @@
  * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
  * @return the resultant ndrange_t
  */
-inline ndrange_t to_ndrange(const arm_compute::Window& win) {
-    return {
+inline ndrange_t to_ndrange(const arm_compute::Window &win)
+{
+    return
+    {
         static_cast<unsigned int>(win[0].end() - win[0].start()),
         static_cast<unsigned int>(win[1].end() - win[1].start()),
         static_cast<unsigned int>(win[2].end() - win[2].start()),
@@ -108,8 +114,10 @@
  * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
  * @return the resultant ndcoord_t
  */
-inline ndcoord_t to_ndcoord(const arm_compute::Window& win) {
-    return {
+inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
+{
+    return
+    {
         { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
         { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
         { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
diff --git a/src/core/NEON/kernels/assembly/arm_gemm_local.hpp b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
similarity index 96%
rename from src/core/NEON/kernels/assembly/arm_gemm_local.hpp
rename to src/core/cpu/kernels/assembly/arm_gemm_local.hpp
index c08ed2d..78e0adf 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/assembly/convolution_parameters.hpp b/src/core/cpu/kernels/assembly/convolution_parameters.hpp
similarity index 98%
rename from src/core/NEON/kernels/assembly/convolution_parameters.hpp
rename to src/core/cpu/kernels/assembly/convolution_parameters.hpp
index d0ef5b5..0c1ae58 100644
--- a/src/core/NEON/kernels/assembly/convolution_parameters.hpp
+++ b/src/core/cpu/kernels/assembly/convolution_parameters.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
similarity index 98%
rename from src/core/NEON/kernels/assembly/gemm_common.hpp
rename to src/core/cpu/kernels/assembly/gemm_common.hpp
index e1fb7a4..4af85ed 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/cpu/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,7 +81,7 @@
     /** Main execute member fucntion
      * @param [in] work_range     specifies the range of work we want to be computed, total range defined by get_window_size()
      * @param [in] thread_locator where are we inside of the thread space
-     * @naram [in] threadid       a unique threadid
+     * @param [in] threadid       a unique threadid
      */
     virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
 
diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/cpu/kernels/assembly/ndrange.hpp
similarity index 98%
rename from src/core/NEON/kernels/assembly/ndrange.hpp
rename to src/core/cpu/kernels/assembly/ndrange.hpp
index a2bb60f..1c8261a 100644
--- a/src/core/NEON/kernels/assembly/ndrange.hpp
+++ b/src/core/cpu/kernels/assembly/ndrange.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 6d83480..b84128e 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include <cmath>
 
@@ -48,10 +48,10 @@
 {
 namespace
 {
-AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
 {
-    AsmGemmInfo asm_info;
-    asm_info.method                  = AsmConvMethod::Im2Col;
+    cpu::AsmGemmInfo asm_info;
+    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
     asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
     asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
     asm_info.activation_info         = info.activation_info();
@@ -61,7 +61,7 @@
 } // namespace
 
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<NEGEMMAssemblyDispatch>()), _ma_kernel(),
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>()), _ma_kernel(),
       _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
       _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
@@ -73,9 +73,9 @@
 {
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
 
-    const AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
-    const bool        is_c_bias     = gemm_info.reshape_b_only_on_first_run();
-    bool              run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
+    const cpu::AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
+    const bool             is_c_bias     = gemm_info.reshape_b_only_on_first_run();
+    bool                   run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
 
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared                      = false;
@@ -85,7 +85,7 @@
     _run_alpha_scale                  = alpha != 1.f;
     _run_bias_addition                = c != nullptr && gemm_info.reshape_b_only_on_first_run();
     _run_addition                     = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
-    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !NEGEMMAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
 
     if(run_optimised)
     {
@@ -235,8 +235,8 @@
     }
 
     // Check if we need to run the optimized assembly kernel
-    AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
-    const bool  run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
+    cpu::AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
+    const bool       run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
 
     if(!run_optimised)
     {
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 0f6f930..ddeacc8 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -26,7 +26,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include <set>
 
@@ -66,10 +66,10 @@
     quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
     return os_info;
 }
-AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
 {
-    AsmGemmInfo asm_info;
-    asm_info.method                  = is_indirect ? AsmConvMethod::Indirect : AsmConvMethod::Conv;
+    cpu::AsmGemmInfo asm_info;
+    asm_info.method                  = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
     asm_info.ps_info                 = info.conv_info;
     asm_info.activation_info         = info.act_info;
     asm_info.depth_output_gemm3d     = true;
@@ -83,7 +83,7 @@
 } // namespace
 
 NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _gemm_asm_func(std::make_unique<NEGEMMAssemblyDispatch>(memory_manager)), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false),
+    : _gemm_asm_func(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager)), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false),
       _run_activation(false)
 {
 }
@@ -102,7 +102,7 @@
     _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 });
 
     // Configure assembly dispatch
-    AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
     if(is_data_type_quantized(input->info()->data_type()))
     {
         asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info);
@@ -149,8 +149,8 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMAssemblyDispatch::validate(input, weights, biases, output, asm_info));
+    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(input, weights, biases, output, asm_info));
     return Status{};
 }
 void NEGEMMConv2d::run()
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 921626f..53dd39e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,16 +42,16 @@
 #include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 namespace arm_compute
 {
 namespace
 {
-AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
 {
-    AsmGemmInfo asm_info;
-    asm_info.method                  = AsmConvMethod::Im2Col;
+    cpu::AsmGemmInfo asm_info;
+    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
     asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
     asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
     asm_info.activation_info         = info.activation_info();
@@ -66,7 +66,7 @@
 NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<NEGEMMAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
+    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
       _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(),
       _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
       _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
@@ -135,7 +135,7 @@
     }
 
     // Initialize assembly kernel meta-data
-    const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
 #ifdef __aarch64__
     switch(a->info()->data_type())
     {
@@ -261,7 +261,7 @@
     }
     // Configure activation
     const ActivationLayerInfo &activation = gemm_info.activation_info();
-    _run_activation                       = activation.enabled() && (!_assembly_path || !NEGEMMAssemblyDispatch::is_activation_supported(activation));
+    _run_activation                       = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
     if(_run_activation)
     {
         _activation_func.configure(output, nullptr, activation);
@@ -362,19 +362,19 @@
     }
 
     // Initialize assembly kernel meta-data
-    const AsmGemmInfo asm_info = init_assembly_metadata(info);
+    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(info);
 
     // Check if we need to run the optimized assembly kernel
     bool run_optimised             = false;
     bool run_optimised_requantized = false;
     if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
+        run_optimised             = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
         run_optimised_requantized = run_optimised;
     }
     else
     {
-        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+        run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
     }
 
     if(run_optimised)
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 941cb21..0bf1738 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -34,7 +34,7 @@
 #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include "src/core/NEON/kernels/convolution/common/utils.hpp"
 #include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
similarity index 95%
rename from src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
rename to src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index c58a662..36c1bbb 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -21,18 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
-#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
+#include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
+#include "src/core/cpu/kernels/assembly/arm_gemm.hpp"
 
 #include <arm_neon.h>
 #include <cstdlib>
 
 namespace arm_compute
 {
+namespace cpu
+{
 namespace
 {
 struct free_delete
@@ -213,7 +215,7 @@
 
 /** Fallback in case ACL doesn't have a function */
 template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
-class Fallback : public NEGEMMAssemblyDispatch::IFallback
+class Fallback : public CpuGemmAssemblyDispatch::IFallback
 {
 public:
     /** Destructor */
@@ -484,7 +486,7 @@
     }
 
     // arm_compute wrapper for the Gemm object (see above)
-    std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = std::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
     ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
     acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
     const size_t workspace_size = _gemm_kernel_asm->get_working_size();
@@ -679,7 +681,7 @@
 }
 
 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                      const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                      IWeightsManager *weights_manager)
 {
@@ -696,7 +698,7 @@
 }
 
 template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
+void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                            const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
                            IWeightsManager *weights_manager)
 {
@@ -742,12 +744,12 @@
 
 } //namespace
 
-NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
 {
 }
 
-Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_UNUSED(c, info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
@@ -778,19 +780,19 @@
     return Status{};
 }
 
-bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
 {
     arm_gemm::Activation act = map_to_arm_gemm_activation(activation);
     return act.type != arm_gemm::Activation::Type::None;
 }
 
-void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info))
+    if(!CpuGemmAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info))
     {
         return;
     }
@@ -839,22 +841,23 @@
     }
 }
 
-void NEGEMMAssemblyDispatch::prepare()
+void CpuGemmAssemblyDispatch::prepare()
 {
     ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
     _arm_gemm->prepare();
 }
 
-bool NEGEMMAssemblyDispatch::is_configured() const
+bool CpuGemmAssemblyDispatch::is_configured() const
 {
     return _arm_gemm != nullptr && _arm_gemm->is_configured();
 }
 
-void NEGEMMAssemblyDispatch::run()
+void CpuGemmAssemblyDispatch::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
     _arm_gemm->run();
 }
-} //namespace arm_compute
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
similarity index 87%
rename from src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
rename to src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 381fa4d..0bbae49 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -32,6 +32,8 @@
 
 namespace arm_compute
 {
+namespace cpu
+{
 /* Convolution method supported by the assembly gemm interface */
 enum class AsmConvMethod
 {
@@ -55,18 +57,21 @@
 };
 
 /** Assembly kernel glue */
-class NEGEMMAssemblyDispatch : public IFunction
+class CpuGemmAssemblyDispatch : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
     /** Prevent instances of this class from being copy constructed */
-    NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch &) = delete;
+    CpuGemmAssemblyDispatch(const CpuGemmAssemblyDispatch &) = delete;
     /** Prevent instances of this class from being copied */
-    NEGEMMAssemblyDispatch &operator=(const NEGEMMAssemblyDispatch &) = delete;
-    NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch &&)                 = default;
-    NEGEMMAssemblyDispatch &operator=(NEGEMMAssemblyDispatch &&) = default;
-    ~NEGEMMAssemblyDispatch()                                    = default;
+    CpuGemmAssemblyDispatch &operator=(const CpuGemmAssemblyDispatch &) = delete;
+    /** Default move constructor */
+    CpuGemmAssemblyDispatch(CpuGemmAssemblyDispatch &&) = default;
+    /** Default move assignment operator */
+    CpuGemmAssemblyDispatch &operator=(CpuGemmAssemblyDispatch &&) = default;
+    /** Default destructor */
+    ~CpuGemmAssemblyDispatch() = default;
 
     class IFallback
     {
@@ -121,5 +126,6 @@
     MemoryGroup                _memory_group;    /**< Function memory group */
     IWeightsManager           *_weights_manager; /**< Pointer to the weights manager */
 };
+} // namespace cpu
 } // namespace arm_compute
 #endif /* SRC_NEGEMMASSEMBLYDISPATCH_H */