Port NEGEMMConvolutionLayer

Details:
Port NEWeightsReshapeKernel to CpuWeightsReshapeKernel
Port NEGEMMConvolutionLayer to CpuGemmConvolution

Resolves: COMPMID-4509

Change-Id: I3c7051e2c3f6d808a7ccb898aad70e5b221b9dc3
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5938
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
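
Usage of the function is unchanged by this port; a minimal sketch (tensor
creation and allocation elided, tensor names are illustrative):

    Tensor src, weights, biases, dst;
    // ... initialize tensor infos and allocate backing memory ...
    NEGEMMConvolutionLayer conv{};
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */));
    conv.run(); // implicitly calls prepare(), which reshapes the weights once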
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 6c6c51d..6d45a9d 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -65,6 +65,5 @@
 #include "src/core/NEON/kernels/NEStackLayerKernel.h"
 #include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "src/core/NEON/kernels/NETileKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 
 #endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
deleted file mode 100644
index 9bef9c3..0000000
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
-{
-    TensorShape output_shape{ input->tensor_shape() };
-
-    output_shape.collapse(3);
-    const size_t tmp_dim = output_shape[0];
-    output_shape.set(0, output_shape[1]);
-    output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
-
-    return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
-    }
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input)
-{
-    Window window = calculate_max_window(*input, Steps());
-    window.set(Window::DimX, Window::Dimension(0, input->dimension(0), input->dimension(0)));
-    window.set(Window::DimY, Window::Dimension(0, input->dimension(1), input->dimension(1)));
-    window.set(Window::DimZ, Window::Dimension(0, input->dimension(2), input->dimension(2)));
-
-    // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-
-    return std::make_pair(Status{}, window);
-}
-} // namespace
-
-NEWeightsReshapeKernel::NEWeightsReshapeKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), (bias != nullptr))));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info()));
-
-    _input  = input;
-    _bias   = bias;
-    _output = output;
-
-    // Configure kernel
-    auto win_config = validate_and_configure_window(input->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NEWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get()).first);
-
-    return Status{};
-}
-
-void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const unsigned int kernel_size_x   = _input->info()->dimension(0);
-    const unsigned int kernel_size_y   = _input->info()->dimension(1);
-    const unsigned int kernel_depth    = _input->info()->dimension(2);
-    const unsigned int input_stride_x  = _input->info()->strides_in_bytes().x();
-    const unsigned int input_stride_y  = _input->info()->strides_in_bytes().y();
-    const unsigned int input_stride_z  = _input->info()->strides_in_bytes().z();
-    const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
-
-    // Create iterators
-    Iterator in(_input, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-        const int kernel_idz = id[4];
-
-        // Setup pointers
-        const uint8_t *tmp_input_ptr        = in.ptr();
-        uint8_t       *tmp_output_ptr       = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
-        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
-        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
-        {
-            for(unsigned int j = 0; j < kernel_size_y; ++j)
-            {
-                for(unsigned int i = 0; i < kernel_size_x; ++i)
-                {
-                    std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
-                }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
-            }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
-
-        // Add bias
-        if(_bias != nullptr)
-        {
-            std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
-        }
-    },
-    in);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
deleted file mode 100644
index 5701c84..0000000
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class NEWeightsReshapeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEWeightsReshapeKernel";
-    }
-    /** Constructor.*/
-    NEWeightsReshapeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
-    /** Default destructor */
-    ~NEWeightsReshapeKernel() = default;
-    /** Set the input and output of the kernel.
-     *
-     * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
-     *                    Data types supported: All
-     * @param[in]  bias   The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
-     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[out] output The output tensor. Data types supported: Same as @p input
-     */
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
-     *
-     * @param[in] input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared.
-     *                   Data types supported: All
-     * @param[in] biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
-     *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
-     * @param[in] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor *_input;
-    const ITensor *_bias;
-    ITensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.cpp b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
index a5dbcc2..ca6c9bf 100644
--- a/src/core/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
@@ -331,7 +331,7 @@
     in, out);
 }
 
-void CpuIm2ColKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                                 bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.h b/src/core/cpu/kernels/CpuIm2ColKernel.h
index 4301a23..ffac507 100644
--- a/src/core/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.h
@@ -77,7 +77,7 @@
      * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+    void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                    bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration
      *
diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..79f0589
--- /dev/null
+++ b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
+{
+    TensorShape output_shape{ src->tensor_shape() };
+
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
+
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
+        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4]));
+    }
+
+    // Checks performed when output is configured
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
+                                                  biases,
+                                                  dst));
+
+    // Configure kernel
+    Window window = calculate_max_window(*src, Steps());
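+    // Use whole-dimension steps in x, y and z so each window iteration covers one complete 3D kernel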
+    window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0)));
+    window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1)));
+    window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2)));
+    ICpuKernel::configure(window);
+}
+
+Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst));
+    return Status{};
+}
+
+void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
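+    // Fetch the tensors from the pack; the bias slot is optional and yields nullptr when absent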
+    auto src    = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS);
+    auto dst    = tensors.get_tensor(TensorType::ACL_DST);
+
+    const unsigned int kernel_size_x   = src->info()->dimension(0);
+    const unsigned int kernel_size_y   = src->info()->dimension(1);
+    const unsigned int kernel_depth    = src->info()->dimension(2);
+    const unsigned int input_stride_x  = src->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y  = src->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z  = src->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = dst->info()->strides_in_bytes().y();
+
+    // Create iterators
+    Iterator in(src, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr        = in.ptr();
+        uint8_t       *tmp_output_ptr       = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size_y; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size_x; ++i)
+                {
+                    std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr      = curr_input_depth_ptr;
+        }
+
+        // Add bias
+        if(biases != nullptr)
+        {
+            std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size());
+        }
+    },
+    in);
+}
+
+const char *CpuWeightsReshapeKernel::name() const
+{
+    return "CpuWeightsReshapeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.h b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h
new file mode 100644
index 0000000..eea150a
--- /dev/null
+++ b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Kernel to perform reshaping on the weights used by convolution and locally connected layers
+ *
+ * Rearranges each 3-dimensional kernel into a single row, producing a matrix of linearized kernels.
+ * In combination with @ref cpu::kernels::CpuIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3x3 weight kernel with a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CpuWeightsReshapeKernel : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuWeightsReshapeKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  src    The input tensor info to convert. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                    and a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
+     *                    Data types supported: All
+     * @param[in]  biases The shared biases tensor info to append. The bias is a 1D tensor with dimensions [OFM] if shared and a 2D tensor with
+     *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p src
+     *                    @warning Appending biases to the reshaped weights matrix is not supported for quantized asymmetric types.
+     * @param[out] dst    The output tensor info. Data types supported: Same as @p src
+     */
+    void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuWeightsReshapeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */
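
A hedged sketch of driving the new stateless kernel directly (tensor setup
elided); the pack slots mirror the lookups in run_op() above:

    cpu::kernels::CpuWeightsReshapeKernel reshape{};
    reshape.configure(weights.info(), biases.info(), dst.info());
    ITensorPack pack = { { TensorType::ACL_SRC, &weights },
                         { TensorType::ACL_BIAS, &biases },
                         { TensorType::ACL_DST, &dst } };
    // Split along dimension 3 (OFM), as the removed NEWeightsReshapeKernel did
    NEScheduler::get().schedule_op(&reshape, 3, reshape.window(), pack);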
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 5bd61b4..712f41f 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
 using namespace arm_compute::misc::shape_calculator;
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 7c06b0a..6386a67 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -26,618 +26,99 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemmConvolution.h"
 
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "src/core/cpu/kernels/CpuCol2ImKernel.h"
-#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
-
-#include <set>
-#include <tuple>
+using namespace arm_compute::experimental;
 
 namespace arm_compute
 {
-using namespace arm_compute::misc::shape_calculator;
-
-NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default;
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() noexcept
-    : _weights_reshape_kernel()
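+/** Internal state of the function: the ported cpu::CpuGemmConvolution operator plus the tensor packs and workspace tensors used by run() and prepare() */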
+struct NEGEMMConvolutionLayer::Impl
 {
-}
-
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
-{
-    // Perform validation step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
-                                                                          (biases != nullptr) ? biases->info() : nullptr,
-                                                                          output->info()));
-    const bool     append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
-
-    _weights_reshape_kernel = std::make_unique<NEWeightsReshapeKernel>();
-    _weights_reshape_kernel->configure(weights, biases_to_use, output);
-
-    output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1,
-                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    if((output != nullptr) && (output->total_size() != 0))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
-        NEWeightsReshapeKernel::validate(weights, biases, output);
-    }
-
-    return Status{};
-}
-
-void NEConvolutionLayerReshapeWeights::run()
-{
-    NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
-}
-
-NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
+    const ITensor                           *weights{ nullptr };
+    std::unique_ptr<cpu::CpuGemmConvolution> op{ nullptr };
+    ITensorPack                              run_pack{};
+    ITensorPack                              prep_pack{};
+    MemoryGroup                              memory_group{};
+    IWeightsManager                         *weights_manager{ nullptr };
+    MemoryRequirements                       aux_mem_req{};
+    WorkspaceData<Tensor>                    workspace_tensors{};
+    bool                                     is_prepared{ false };
+};
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
-    : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
-      _col2im_kernel(), _reshape_layer(), _input(nullptr), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
-      _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false)
+    : _impl(std::make_unique<Impl>())
 {
+    _impl->weights_manager = weights_manager;
+    _impl->memory_group    = MemoryGroup(memory_manager);
 }
-
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
-                                           act_info, gemm_3d_depth, _skip_im2col));
-
-    // Create GEMMInfo structure
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
-    // Supported activations in GEMM
-    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                             };
-
-    if(_is_quantized)
-    {
-        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
-        // Extract and negate input and weights offset
-        const QuantizationInfo        iqinfo    = input->info()->quantization_info();
-        const QuantizationInfo        wqinfo    = weights->info()->quantization_info();
-        const QuantizationInfo        oqinfo    = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
-        const UniformQuantizationInfo uiqinfo   = iqinfo.uniform();
-        const UniformQuantizationInfo uoqinfo   = oqinfo.uniform();
-        const DataType                data_type = input->info()->data_type();
-
-        input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
-        if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
-        {
-            const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
-            weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
-        }
-
-        // Merge activation with output stage
-        PixelValue type_min{};
-        PixelValue type_max{};
-        std::tie(type_min, type_max) = get_min_max(data_type);
-        int32_t min_activation = type_min.get<int32_t>();
-        int32_t max_activation = type_max.get<int32_t>();
-
-        if(supported_acts.count(act_info.activation()) != 0)
-        {
-            std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
-        }
-
-        GEMMLowpOutputStageInfo output_info;
-        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset          = uoqinfo.offset;
-        output_info.gemmlowp_min_bound       = min_activation;
-        output_info.gemmlowp_max_bound       = max_activation;
-        output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
-        quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
-        _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
-
-        // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
-        input->info()->set_quantization_info(iqinfo);
-        weights->info()->set_quantization_info(wqinfo);
-    }
-    else
-    {
-        // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
-    }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                           const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
-    const DataType data_type             = input->data_type();
-    const bool     is_quantized          = is_data_type_quantized_asymmetric(data_type);
-    const bool     is_activation_enabled = act_info.enabled();
-
-    // Create GEMMInfo structure
-    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                        gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                        false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
-    if(is_quantized)
-    {
-        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
-        // Extract and negate input and weights offset
-        const QuantizationInfo       &iqinfo  = input->quantization_info();
-        const QuantizationInfo       &wqinfo  = weights->quantization_info();
-        const QuantizationInfo       &oqinfo  = (output->total_size() == 0) ? iqinfo : output->quantization_info();
-        const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
-
-        // Merge activation with output stage
-        PixelValue type_min{};
-        PixelValue type_max{};
-        std::tie(type_min, type_max) = get_min_max(data_type);
-        int32_t min_activation = type_min.get<int32_t>();
-        int32_t max_activation = type_max.get<int32_t>();
-
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
-        {
-            std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
-        }
-
-        GEMMLowpOutputStageInfo output_info;
-        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset          = uoqinfo.offset;
-        output_info.gemmlowp_min_bound       = min_activation;
-        output_info.gemmlowp_max_bound       = max_activation;
-        output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
-
-        // Perform validation step on GEMMLowp
-        std::unique_ptr<ITensorInfo> input_qa   = input->clone();
-        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
-        input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
-        weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
-        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
-    }
-    else
-    {
-        // Perform validation step on Matrix multiply function
-        return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
-    }
-}
-
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
-    const DataType     data_type = input_info->data_type();
-    const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
-    const unsigned int mult_z    = skip_im2col ? gemm_3d_depth : 1U;
-
-    // Set dummy tensor shapes for the validation
-    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
-    const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
-    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
-
-    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
-}
+NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
 
 void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
                                        const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_UNUSED(num_groups, weights_info);
-    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
-                                                                weights->info(),
-                                                                biases != nullptr ? biases->info() : nullptr,
-                                                                output->info(),
-                                                                conv_info,
-                                                                weights_info,
-                                                                dilation,
-                                                                act_info,
-                                                                num_groups));
+    _impl->weights = weights;
+    _impl->op      = std::make_unique<cpu::CpuGemmConvolution>();
+    _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, num_groups);
 
-    const DataType   data_type   = input->info()->data_type();
-    const DataLayout data_layout = input->info()->data_layout();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
-    const unsigned int kernel_width  = weights->info()->dimension(idx_width);
-    const unsigned int kernel_height = weights->info()->dimension(idx_height);
-
-    _input            = input;
-    _is_prepared      = weights_info.retain_internal_weights();
-    _original_weights = weights;
-    _original_output  = output;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _data_layout      = data_layout;
-    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
-    const ITensor *gemm_input_to_use  = input;
-    ITensor       *gemm_output_to_use = output;
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
-                                                 input->info()->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv_info,
-                                                 dilation);
-
-    // Check if GEMM3D is supported
-    if(data_layout == DataLayout::NHWC)
+    _impl->run_pack =
     {
-        _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
-        // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!_skip_col2im)
-        {
-            _skip_im2col = false;
-        }
-    }
-    else
+        { TensorType::ACL_SRC_0, input },
+        { TensorType::ACL_SRC_1, weights },
+        { TensorType::ACL_SRC_2, biases },
+        { TensorType::ACL_DST, output }
+    };
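+    // prepare() only needs the constant tensors (weights and, if present, biases)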
+    _impl->prep_pack =
     {
-        _skip_col2im = false;
-    }
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-
-    unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
-
-    // _weights_reshaped will be auto configured in the kernel.
-    // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
-    const ITensor *weights_to_use = weights;
-
-    if(_weights_manager && _weights_manager->are_weights_managed(weights))
-    {
-        _reshape_weights_managed.configure(weights, nullptr);
-        weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
-    }
-    else
-    {
-        _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
-        weights_to_use = &_weights_reshaped;
-    }
-
-    // Create tensor to store im2col reshaped inputs
-    if(!_skip_im2col)
-    {
-        _memory_group.manage(&_im2col_output);
-
-        // Configure
-        _im2col_kernel = std::make_unique<cpu::kernels::CpuIm2ColKernel>();
-        _im2col_kernel->configure(input->info(), _im2col_output.info(), Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
-        // Update GEMM input
-        gemm_input_to_use = &_im2col_output;
-    }
-
-    // Create temporary GEMM output tensor in case we cannot skip col2im
-    const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
-    if(!_skip_col2im)
-    {
-        TensorShape shape_gemm;
-
-        // Calculate GEMM output shape
-        shape_gemm = _im2col_output.info()->tensor_shape();
-        shape_gemm.set(0, mat_weights_cols);
-        shape_gemm.set(1, conv_w * conv_h);
-
-        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
-        TensorInfo info_gemm(shape_gemm, 1, output_data_type);
-        info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
-        _gemm_output.allocator()->init(info_gemm);
-        _gemm_output_3d.allocator()->init(info_gemm);
-        _memory_group.manage(&_gemm_output);
-
-        // Update GEMM output
-        gemm_output_to_use = &_gemm_output;
-    }
-    else
-    {
-        TensorInfo out_info{ *output->info() };
-        out_info.set_data_type(output_data_type).set_data_layout(input->info()->data_layout()).set_is_resizable(true);
-        _gemm_output.allocator()->init(out_info);
-        _gemm_output_3d.allocator()->init(out_info);
-        _memory_group.manage(&_gemm_output);
-
-        // Update GEMM output
-        gemm_output_to_use = &_gemm_output_3d;
-    }
-
-    // Configure GEMM
-    // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
-    const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
-    configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
-
-    if(!_skip_im2col)
-    {
-        _im2col_output.allocator()->allocate();
-    }
-
-    if(!_skip_col2im)
-    {
-        if(_data_layout == DataLayout::NCHW)
-        {
-            // Configure col2im
-            _col2im_kernel = std::make_unique<cpu::kernels::CpuCol2ImKernel>();
-            _col2im_kernel->configure(gemm_output_to_use->info(), output->info(), Size2D(conv_w, conv_h));
-        }
-        else
-        {
-            // Configure reshape layer
-            _reshape_layer.configure(gemm_output_to_use, output);
-        }
-    }
-    else
-    {
-        // Configure reshape layer
-        _reshape_layer.configure(gemm_output_to_use, output);
-    }
-
-    if(_is_quantized && !_skip_col2im)
-    {
-        _tmp_output.allocator()->allocate();
-    }
-
-    _gemm_output.allocator()->allocate();
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
-                             "Output shape does not match the expected one");
+        { TensorType::ACL_SRC_1, weights },
+        { TensorType::ACL_SRC_2, biases },
+    };
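+    // Query the operator's auxiliary memory requirements and bind the workspace tensors into the packs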
+    _impl->aux_mem_req       = _impl->op->workspace();
+    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                         const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
-
-    const DataLayout data_layout = input->data_layout();
-    const DataType   data_type   = input->data_type();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
-    const unsigned int kernel_width  = weights->dimension(idx_width);
-    const unsigned int kernel_height = weights->dimension(idx_height);
-
-    TensorInfo         im2col_reshaped_info{};
-    TensorInfo         info_gemm{};
-    TensorInfo         tmp_info{};
-    TensorInfo         weights_reshaped_info{};
-    const ITensorInfo *gemm_input_to_use  = input;
-    const ITensorInfo *gemm_output_to_use = output;
-    const ITensorInfo *weights_to_use     = weights;
-
-    const bool append_bias  = false;
-    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
-    const bool is_bf16      = data_type == DataType::BFLOAT16;
-    bool       skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
-                                                 input->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv_info,
-                                                 dilation);
-
-    // Check if GEMM3D is supported
-    bool skip_col2im = false;
-    if(data_layout == DataLayout::NHWC)
-    {
-        skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
-        // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!skip_col2im)
-        {
-            skip_im2col = false;
-        }
-    }
-
-    if(skip_col2im)
-    {
-        // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
-        {
-            skip_im2col = false;
-            skip_col2im = false;
-        }
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    // Validate biases
-    if(biases != nullptr)
-    {
-        if(is_quantized)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else if(is_bf16)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        }
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    unsigned int mat_weights_cols = weights->dimension(idx_kernels);
-    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
-
-    // Output tensor auto inizialization if not yet initialized
-    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
-    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
-    weights_reshaped_info.set_quantization_info(weights->quantization_info());
-    weights_to_use = &weights_reshaped_info;
-
-    if(!skip_im2col)
-    {
-        // Create tensor info for im2col reshaped inputs
-        // For CPU, the batch size is on the fourth dimension
-        TensorShape shape_im2col = input->tensor_shape();
-        shape_im2col.set(0, mat_weights_rows);
-        shape_im2col.set(1, conv_w * conv_h);
-        shape_im2col.set(2, 1);
-
-        im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
-        im2col_reshaped_info.set_quantization_info(input->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
-        gemm_input_to_use = &im2col_reshaped_info;
-    }
-
-    // Create temporary GEMM output tensor in case we cannot skip col2im
-    const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
-    if(!skip_col2im)
-    {
-        TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
-        shape_gemm.set(0, mat_weights_cols);
-        shape_gemm.set(1, conv_w * conv_h);
-        info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
-    }
-    else
-    {
-        info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type);
-    }
-    info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
-    gemm_output_to_use = &info_gemm;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
-
-    // Validate Col2Im/ReshapeLayer
-    if(!skip_col2im && (data_layout == DataLayout::NCHW))
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
-    }
-
-    return Status{};
+    return cpu::CpuGemmConvolution::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
 }
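
With this change NEGEMMConvolutionLayer::validate() is a pure forwarder, so a caller can
pre-flight a configuration against the ported operator before constructing the layer. A
minimal sketch, assuming illustrative NCHW F32 shapes that are not part of this patch:

    // 3x3 kernel, 16 OFM, stride 1, pad 1 over a 32x32x3 input
    TensorInfo src(TensorShape(32U, 32U, 3U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 32U, 16U, 1U), 1, DataType::F32);
    const Status st = NEGEMMConvolutionLayer::validate(&src, &weights, nullptr, &dst,
                                                       PadStrideInfo(1, 1, 1, 1));
    if(!bool(st))
    {
        // Fall back to another convolution method (e.g. direct convolution)
    }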
 
 void NEGEMMConvolutionLayer::run()
 {
     prepare();
-
-    MemoryGroupResourceScope scope_mg(_memory_group);
-
-    bool out_has_padding = _skip_col2im && (_original_output->info()->padding().bottom != 0 || _original_output->info()->padding().top != 0);
-
-    if(!_skip_im2col)
-    {
-        // Run input reshaping
-        unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-        ITensorPack  pack =
-        {
-            { TensorType::ACL_SRC, _input },
-            { TensorType::ACL_DST, &_im2col_output }
-        };
-        NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
-    }
-
-    // Handle the case where output has top/bottom padding
-    const ITensor *out_to_use = out_has_padding ? &_gemm_output : _original_output;
-    _gemm_output_3d.info()->extend_padding(out_to_use->info()->padding());
-    _gemm_output_3d.allocator()->import_memory(out_to_use->buffer());
-
-    // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
-    if(_is_quantized)
-    {
-        // Run gemmlowp
-        _mm_gemmlowp.run();
-    }
-    else
-    {
-        // Run gemm
-        _mm_gemm.run();
-    }
-
-    // Reshape output matrix
-    if(!_skip_col2im)
-    {
-        if(_data_layout == DataLayout::NCHW)
-        {
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, &_gemm_output },
-                { TensorType::ACL_DST, _original_output }
-            };
-            NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
-        }
-        else
-        {
-            _reshape_layer.run();
-        }
-    }
-    else if(out_has_padding)
-    {
-        _reshape_layer.run();
-    }
-
-    _gemm_output_3d.allocator()->free();
+    MemoryGroupResourceScope scope_mg(_impl->memory_group);
+    _impl->op->run(_impl->run_pack);
 }
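
run() and prepare() only touch pImpl state; the Impl struct itself is introduced earlier in
this patch. For orientation, a sketch of the fields these two methods rely on; the exact
types are assumptions inferred from the usages above and from the other ported layers:

    struct NEGEMMConvolutionLayer::Impl
    {
        const ITensor                            *weights{ nullptr };  // original weights, marked unused once reshaped
        std::unique_ptr<cpu::CpuGemmConvolution>  op{ nullptr };       // ported operator doing the real work
        ITensorPack                               run_pack{};          // ACL_SRC_0..2 / ACL_DST bindings for run()
        ITensorPack                               prep_pack{};         // bindings consumed by prepare()
        MemoryGroup                               memory_group{};
        WorkspaceData<Tensor>                     workspace_tensors{}; // slot -> backing tensor for op->workspace()
        experimental::MemoryRequirements          aux_mem_req{};
        bool                                      is_prepared{ false };
    };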
 
 void NEGEMMConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if(!_impl->is_prepared)
     {
-        if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
-        {
-            _weights_manager->run(_original_weights, &_reshape_weights_managed);
-        }
-        else
-        {
-            // Run weights reshaping and mark original weights tensor as unused
-            _weights_reshaped.allocator()->allocate();
-            _reshape_weights.run();
-            _original_weights->mark_as_unused();
-        }
+        _impl->op->prepare(_impl->prep_pack);
+        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
+                                        _impl->aux_mem_req.end(),
+                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        // Prepare GEMM
-        _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
-        if(!_weights_reshaped.is_used())
+        if(has_reshape != std::end(_impl->aux_mem_req))
         {
-            _weights_reshaped.allocator()->free();
+            _impl->weights->mark_as_unused();
         }
-
-        _is_prepared = true;
+        for(auto &ws : _impl->workspace_tensors)
+        {
+            const int slot = ws.first;
+            for(auto &m : _impl->aux_mem_req)
+            {
+                if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
+                {
+                    auto tensor = ws.second.get();
+                    tensor->allocator()->free();
+                    break;
+                }
+            }
+        }
+        _impl->is_prepared = true;
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
new file mode 100644
index 0000000..a0424b1
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuGemmConvolution.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/core/cpu/kernels/CpuCol2ImKernel.h"
+#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
+#include "src/core/cpu/kernels/CpuReshapeKernel.h"
+#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemm.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <set>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuGemmConvolution::CpuGemmConvolution()
+    : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
+      _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+{
+}
+CpuGemmConvolution::~CpuGemmConvolution() = default;
+
+void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, int gemm_3d_depth)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, gemm_3d_depth, _skip_im2col));
+
+    // Create GEMMInfo structure
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                         false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+    // Supported activations in GEMM
+    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                             };
+
+    if(_is_quantized)
+    {
+        TensorInfo tmp_src{ *src };
+        TensorInfo tmp_weights{ *weights };
+        // The lowp GEMM consumes negated offsets when computing convolution, so the QuantizationInfo has to be adjusted:
+        // extract and negate the input and weights offsets
+        const QuantizationInfo        iqinfo    = src->quantization_info();
+        const QuantizationInfo        wqinfo    = weights->quantization_info();
+        const QuantizationInfo        oqinfo    = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+        const UniformQuantizationInfo uiqinfo   = iqinfo.uniform();
+        const UniformQuantizationInfo uoqinfo   = oqinfo.uniform();
+        const DataType                data_type = src->data_type();
+
+        tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+        if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+        {
+            const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+            tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+        }
+
+        // Merge activation with output stage
+        PixelValue type_min{};
+        PixelValue type_max{};
+        std::tie(type_min, type_max) = get_min_max(data_type);
+        int32_t min_activation = type_min.get<int32_t>();
+        int32_t max_activation = type_max.get<int32_t>();
+
+        if(supported_acts.count(act_info.activation()) != 0)
+        {
+            std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset          = uoqinfo.offset;
+        output_info.gemmlowp_min_bound       = min_activation;
+        output_info.gemmlowp_max_bound       = max_activation;
+        output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL);
+        quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
+
+        _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
+
+        auto mm_mem_req = _mm_gemmlowp->workspace();
+        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+    else
+    {
+        // Configure matrix multiply function
+        _mm_gemm = std::make_unique<CpuGemm>();
+        _mm_gemm->configure(src, weights, biases, dst, 1.0f, 0.0f, gemm_info);
+        auto mm_mem_req = _mm_gemm->workspace();
+        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+}
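
The sign flip above is what lets the lowp GEMM fold the zero points into the matrix
multiply: ACL dequantizes as real = scale * (q - offset). A toy sketch with made-up values:

    // Hypothetical QASYMM8 quantization: real = 0.5f * (q - 10)
    const QuantizationInfo        iqinfo(0.5f, 10);
    const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
    // The GEMM path consumes the offset already negated:
    const QuantizationInfo negated(uiqinfo.scale, -uiqinfo.offset); // scale 0.5, offset -10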
+
+Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                       const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+{
+    const DataType data_type             = src->data_type();
+    const bool     is_quantized          = is_data_type_quantized_asymmetric(data_type);
+    const bool     is_activation_enabled = act_info.enabled();
+
+    // Create GEMMInfo structure
+    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                        gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                        false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+    if(is_quantized)
+    {
+        // The lowp GEMM consumes negated offsets when computing convolution, so the QuantizationInfo has to be adjusted:
+        // extract and negate the input and weights offsets
+        const QuantizationInfo       &iqinfo  = src->quantization_info();
+        const QuantizationInfo       &wqinfo  = weights->quantization_info();
+        const QuantizationInfo       &oqinfo  = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+        const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+        // Merge activation with output stage
+        PixelValue type_min{};
+        PixelValue type_max{};
+        std::tie(type_min, type_max) = get_min_max(data_type);
+        int32_t min_activation = type_min.get<int32_t>();
+        int32_t max_activation = type_max.get<int32_t>();
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+        }
+
+        GEMMLowpOutputStageInfo output_info;
+        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset          = uoqinfo.offset;
+        output_info.gemmlowp_min_bound       = min_activation;
+        output_info.gemmlowp_max_bound       = max_activation;
+        output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
+
+        // Perform validation step on GEMMLowp
+        std::unique_ptr<ITensorInfo> input_qa   = src->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+        weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
+        return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
+    }
+    else
+    {
+        // Perform validation step on Matrix multiply function
+        return CpuGemm::validate(src, weights, nullptr, dst, 1.0f, 0.0f, gemm_info);
+    }
+}
+
+Status CpuGemmConvolution::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+{
+    const DataType     data_type = input_info->data_type();
+    const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
+    const unsigned int mult_z    = skip_im2col ? gemm_3d_depth : 1U;
+
+    // Set dummy tensor shapes for the validation
+    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+    const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
+    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+
+    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
+}
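
The dummy tensors only need to exercise the reinterpret-as-3D path, so 4-element extents
are enough; what matters is where gemm_3d_depth ends up. Spelled out for an assumed depth D:

    // With gemm_3d_depth == D:
    //   skip_im2col == true  -> dummy input (4, 4,     D): depth kept on z (mult_z == D)
    //   skip_im2col == false -> dummy input (4, 4 * D, 1): depth folded into y (mult_y == D)
    // Either way the dummy output is validated as (4, 4, D).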
+
+void CpuGemmConvolution::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_UNUSED(num_groups, weights_info);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConvolution::validate(src,
+                                                            weights,
+                                                            biases,
+                                                            dst,
+                                                            conv_info,
+                                                            weights_info,
+                                                            dilation,
+                                                            act_info,
+                                                            num_groups));
+
+    const DataType   data_type   = src->data_type();
+    const DataLayout data_layout = src->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+
+    _is_prepared  = weights_info.retain_internal_weights();
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _data_layout  = data_layout;
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+    const ITensorInfo *gemm_input_to_use  = src;
+    ITensorInfo       *gemm_output_to_use = dst;
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+                                                 src->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
+    ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+                             "Output shape does not match the expected one");
+
+    // Check if GEMM3D is supported
+    if(data_layout == DataLayout::NHWC)
+    {
+        _skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!_skip_col2im)
+        {
+            _skip_im2col = false;
+        }
+    }
+    else
+    {
+        _skip_col2im = false;
+    }
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+
+    // _weights_reshaped will be auto-configured in the kernel.
+    // Just append biases and do not transpose 1xW as it will be reshaped in CpuGemm
+    _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
+    _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
+    _weights_reshaped.set_quantization_info(weights->quantization_info());
+
+    // Create tensor to store im2col reshaped inputs
+    if(!_skip_im2col)
+    {
+        // Configure
+        _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
+        _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
+
+        // Update GEMM input
+        gemm_input_to_use = &_im2col_output;
+    }
+
+    // Create temporary GEMM output tensor in case we cannot skip col2im
+    const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+    if(!_skip_col2im)
+    {
+        TensorShape shape_gemm;
+
+        // Calculate GEMM output shape
+        shape_gemm = _im2col_output.tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+
+        _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
+        _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+        _gemm_output_3d = TensorInfo(_gemm_output);
+
+        // Update GEMM output
+        gemm_output_to_use = &_gemm_output;
+    }
+    else
+    {
+        _gemm_output_3d = TensorInfo(*dst);
+        _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
+        _gemm_output = TensorInfo(_gemm_output_3d);
+
+        // Update GEMM output
+        gemm_output_to_use = &_gemm_output_3d;
+    }
+
+    // Configure GEMM
+    // When col2im is skipped, GEMM must run in 3D mode (gemm_3d_depth != 0) to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
+
+    if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+    {
+        // Configure col2im
+        _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
+        _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
+    }
+    else
+    {
+        // Configure reshape layer
+        _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>();
+        _reshape_kernel->configure(gemm_output_to_use, dst);
+    }
+
+    _aux_mem[Im2ColOutput]    = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Prepare, _weights_reshaped.total_size());
+    _aux_mem[GemmOutput]      = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+    _aux_mem[GemmOutput3d]    = MemoryInfo(offset_int_vec(GemmOutput3d), MemoryLifetime::Temporary, _gemm_output_3d.total_size());
+}
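
Unlike the NE layer, the operator binds tensors at run time through an ITensorPack. A
condensed usage sketch under the experimental operator API; the tensor variables and the
workspace import are assumptions, and the NE wrapper automates all of this:

    cpu::CpuGemmConvolution conv;
    conv.configure(&src_info, &weights_info, &biases_info, &dst_info, conv_info);

    ITensorPack pack =
    {
        { TensorType::ACL_SRC_0, &src },     // ITensor objects backed by the infos above
        { TensorType::ACL_SRC_1, &weights },
        { TensorType::ACL_SRC_2, &biases },
        { TensorType::ACL_DST, &dst }
    };
    // Auxiliary tensors reported by conv.workspace() must also be added to the
    // pack, keyed by their slot ids (offset_int_vec), before calling run().
    conv.run(pack);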
+
+Status CpuGemmConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
+
+    const DataLayout data_layout = src->data_layout();
+    const DataType   data_type   = src->data_type();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         tmp_info{};
+    TensorInfo         weights_reshaped_info{};
+    const ITensorInfo *gemm_input_to_use  = src;
+    const ITensorInfo *gemm_output_to_use = dst;
+    const ITensorInfo *weights_to_use     = weights;
+
+    const bool append_bias  = false;
+    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+    const bool is_bf16      = data_type == DataType::BFLOAT16;
+    bool       skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+                                                 src->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv_info,
+                                                 dilation);
+
+    // Check if GEMM3D is supported
+    bool skip_col2im = false;
+    if(data_layout == DataLayout::NHWC)
+    {
+        skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!skip_col2im)
+        {
+            skip_im2col = false;
+        }
+    }
+
+    if(skip_col2im)
+    {
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!bool(validate_gemm3d(src, weights, act_info, conv_h, skip_im2col)))
+        {
+            skip_im2col = false;
+            skip_col2im = false;
+        }
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else if(is_bf16)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+
+    weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
+    weights_reshaped_info.set_quantization_info(weights->quantization_info());
+    weights_to_use = &weights_reshaped_info;
+
+    if(!skip_im2col)
+    {
+        // Create tensor info for im2col reshaped inputs
+        // For CPU, the batch size is on the fourth dimension
+        TensorShape shape_im2col = src->tensor_shape();
+        shape_im2col.set(0, mat_weights_rows);
+        shape_im2col.set(1, conv_w * conv_h);
+        shape_im2col.set(2, 1);
+
+        im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+        im2col_reshaped_info.set_quantization_info(src->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+        gemm_input_to_use = &im2col_reshaped_info;
+    }
+
+    // Create temporary GEMM output tensor in case we cannot skip col2im
+    const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+    if(!skip_col2im)
+    {
+        TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+        info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
+    }
+    else
+    {
+        info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type);
+    }
+    info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+    gemm_output_to_use = &info_gemm;
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
+
+    // Validate Col2Im/ReshapeLayer
+    if(!skip_col2im && (data_layout == DataLayout::NCHW))
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+    }
+
+    return Status{};
+}
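
The GEMM geometry above is easiest to sanity-check with numbers. For an assumed 3x3x3
kernel, 16 OFM and a 32x32 NCHW input with stride 1 and pad 1:

    // mat_weights_rows = 3 * 3 * 3 = 27            -> K, one im2col patch per output pixel
    // mat_weights_cols = 16                        -> N, the number of kernels
    // conv_w * conv_h  = 32 * 32   = 1024          -> M
    // im2col output    : (27, 1024, 1, batches)
    // GEMM output      : (16, 1024, 1, batches), col2im'd back to (32, 32, 16, batches)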
+
+void CpuGemmConvolution::run(ITensorPack &tensors)
+{
+    prepare(tensors);
+
+    auto src               = tensors.get_const_tensor(ACL_SRC_0);
+    auto weights           = tensors.get_const_tensor(ACL_SRC_1);
+    auto biases            = tensors.get_const_tensor(ACL_SRC_2);
+    auto dst               = tensors.get_tensor(ACL_DST);
+    auto gemm_input_to_use = src;
+
+    CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+    CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+
+    bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
+    if(!_skip_im2col)
+    {
+        // Run input reshaping
+        unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+        ITensorPack  pack =
+        {
+            { TensorType::ACL_SRC, src },
+            { TensorType::ACL_DST, im2col_output.get() }
+        };
+        NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
+        gemm_input_to_use = im2col_output.get();
+    }
+
+    // Handle the case where output has top/bottom padding
+    const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
+    _gemm_output_3d.extend_padding(out_to_use->info()->padding());
+    CpuAuxTensorHandler gemm_output_3d(offset_int_vec(GemmOutput3d), _gemm_output_3d, tensors, true);
+    auto                gemm_output_to_use = gemm_output.get();
+    if(_skip_im2col)
+    {
+        gemm_output_to_use = gemm_output_3d.get();
+    }
+    if(_skip_col2im && !out_has_padding)
+    {
+        gemm_output_to_use = dst;
+    }
+
+    // Run either CpuGemm or CpuGemmLowpMatrixMultiplyCore
+    ITensorPack pack_mm =
+    {
+        { TensorType::ACL_SRC_0, gemm_input_to_use },
+        { TensorType::ACL_SRC_1, weights },
+        { TensorType::ACL_SRC_2, biases },
+        { TensorType::ACL_DST, gemm_output_to_use }
+    };
+    if(_is_quantized)
+    {
+        // Run gemmlowp
+        _mm_gemmlowp->run(pack_mm);
+    }
+    else
+    {
+        // Run gemm
+        _mm_gemm->run(pack_mm);
+    }
+
+    // Reshape output matrix
+    if(!_skip_col2im)
+    {
+        if(_data_layout == DataLayout::NCHW)
+        {
+            ITensorPack pack =
+            {
+                { TensorType::ACL_SRC, gemm_output.get() },
+                { TensorType::ACL_DST, dst }
+            };
+            NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
+        }
+        else
+        {
+            ITensorPack pack =
+            {
+                { TensorType::ACL_SRC, gemm_output_to_use },
+                { TensorType::ACL_DST, dst }
+            };
+            NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+        }
+    }
+    else if(out_has_padding)
+    {
+        ITensorPack pack =
+        {
+            { TensorType::ACL_SRC, gemm_output_to_use },
+            { TensorType::ACL_DST, dst }
+        };
+        NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+    }
+}
+
+void CpuGemmConvolution::prepare(ITensorPack &tensors)
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark original weights tensor as unused
+        ITensor            *weights_reshaped_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+        CpuAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
+        auto                weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        ITensorPack         pack =
+        {
+            { TensorType::ACL_SRC, weights },
+            { TensorType::ACL_DST, weights_reshaped.get() }
+        };
+        NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
+        tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+
+        // Prepare GEMM
+        _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements CpuGemmConvolution::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.h b/src/runtime/cpu/operators/CpuGemmConvolution.h
new file mode 100644
index 0000000..8b41cb4
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMCONVOLUTION_H
+#define ARM_COMPUTE_CPU_GEMMCONVOLUTION_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+class CpuGemmLowpOutputStage;
+namespace kernels
+{
+class CpuWeightsReshapeKernel;
+class CpuIm2ColKernel;
+class CpuCol2ImKernel;
+class CpuReshapeKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
+ *
+ * -# @ref cpu::kernels::CpuIm2ColKernel
+ * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32)
+ * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
+ * -# @ref cpu::kernels::CpuWeightsReshapeKernel
+ *
+ */
+class CpuGemmConvolution : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuGemmConvolution();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuGemmConvolution(const CpuGemmConvolution &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non-movable objects) */
+    CpuGemmConvolution(CpuGemmConvolution &&) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuGemmConvolution &operator=(const CpuGemmConvolution &) = delete;
+    /** Prevent instances of this class from being moved (As this class contains non-movable objects) */
+    CpuGemmConvolution &operator=(CpuGemmConvolution &&) = delete;
+    /** Destructor */
+    ~CpuGemmConvolution();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2     |dst            |
+     * |:--------------|:------------------|:--------|:--------------|
+     * |F16            |F16                |F16      |F16            |
+     * |F32            |F32                |F32      |F32            |
+     * |BFLOAT16       |BFLOAT16           |BFLOAT16 |BFLOAT16       |
+     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
+     *
+     * @param[in]  src          Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                          while every optional dimension from 4 and above represent a batch of inputs.
+     *                          Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights      Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                          Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases       Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                          Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst          Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                          Data types supported: Same as @p input.
+     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with cpu::kernels::CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                          tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+     * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     */
+    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuGemmConvolution::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    /** Configures the appropriate matrix multiply routine
+     *
+     * @param[in]  src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights       Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases        Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                           Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst           Output tensor info. Data types supported: Same as @p input,
+     *                           except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+     * @param[in]  act_info      (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+     */
+    void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                      int gemm_3d_depth = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmConvolution matrix multiply routines
+     *
+     * @param[in] src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights       Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in] biases        Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                          Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] dst           Output tensor info. Data types supported: Same as @p input,
+     *                          except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+     * @param[in] act_info      (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in] skip_im2col   (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+     *
+     * @return a status
+     */
+    static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                              int gemm_3d_depth = 1, bool skip_im2col = false);
+    /** Static function to check if GEMM3D is supported in @ref CpuGemm or in @ref CpuGemmLowpMatrixMultiplyCore
+     *
+     * @param[in] src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights       Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] act_info      Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] gemm_3d_depth Depth of GEMM 3D
+     * @param[in] skip_im2col   Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+     *
+     * @return a status
+     */
+    static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+
+    enum AuxTensorIdx
+    {
+        // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors
+        Im2ColOutput = 9,
+        WeightsReshaped,
+        GemmOutput,
+        GemmOutput3d,
+        Count
+    };
+
+    std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
+    std::unique_ptr<kernels::CpuIm2ColKernel>         _im2col_kernel;
+    std::unique_ptr<CpuGemm>                          _mm_gemm;
+    std::unique_ptr<CpuGemmLowpMatrixMultiplyCore>    _mm_gemmlowp;
+    std::unique_ptr<kernels::CpuCol2ImKernel>         _col2im_kernel;
+    std::unique_ptr<kernels::CpuReshapeKernel>        _reshape_kernel;
+
+    TensorInfo _im2col_output;
+    TensorInfo _weights_reshaped;
+    TensorInfo _gemm_output;
+    TensorInfo _gemm_output_3d;
+
+    DataLayout _data_layout;
+
+    bool _skip_im2col;
+    bool _skip_col2im;
+    bool _is_quantized;
+    bool _is_prepared;
+
+    experimental::MemoryRequirements _aux_mem{ Count };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMCONVOLUTION_H */
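
Each MemoryInfo returned by workspace() carries a slot id, a lifetime and a byte size; the
runtime owns the allocations and injects them into the pack. A hypothetical, simplified
version of the wiring the NE wrapper performs via the MemoryHelpers.h utilities, reusing
the conv/pack names from the earlier usage sketch:

    std::vector<std::unique_ptr<Tensor>> owned;
    for(const auto &req : conv.workspace())
    {
        if(req.size == 0)
        {
            continue;
        }
        auto t = std::make_unique<Tensor>();
        t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        t->allocator()->allocate();
        pack.add_tensor(req.slot, t.get());
        owned.emplace_back(std::move(t)); // Persistent/Prepare tensors must outlive run()/prepare()
    }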