Add GemmLowp MMUL Reshaped Only Rhs Support for QASYMM8/QASYMM8_SIGNED

This patch introduces a GEMMLowp routine, based on the cl_arm_matrix_multiply (MMUL) extension, that is optimized for Arm(R) Mali(TM)-G715 and Arm(R) Mali(TM)-G615.

Resolves: COMPMID-5398

Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I8d06453645688f3658b6c7c06f1ebc25a2505661
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7932
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
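
For context, the new kernel is not invoked directly: ClGemmLowpMatrixMultiplyCore selects the
RESHAPED_ONLY_RHS_MMUL path through the GPU heuristics on devices that expose the
cl_arm_matrix_multiply extension. A minimal sketch of reaching it through the public runtime
API is shown below; the tensor shapes, quantization parameters and main() scaffolding are
illustrative assumptions, not part of this patch.

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // A: M x K, B: K x N, Dst: M x N. ACL shapes are (width, height), i.e. (K, M), (N, K), (N, M).
        CLTensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S32));

        CLGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, nullptr, &dst); // the kernel type is chosen by the GPU heuristics

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill a and b with quantized data ...

        gemmlowp.run();
        CLScheduler::get().sync();
        return 0;
    }
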
diff --git a/Android.bp b/Android.bp
index ad28cc3..4a6ba4f 100644
--- a/Android.bp
+++ b/Android.bp
@@ -43,6 +43,7 @@
         "src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl",
         "src/core/CL/cl_kernels/common/gemm_utils.cl",
         "src/core/CL/cl_kernels/common/gemmlowp.cl",
+        "src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl",
         "src/core/CL/cl_kernels/common/gemv.cl",
         "src/core/CL/cl_kernels/common/generate_proposals.cl",
         "src/core/CL/cl_kernels/common/generate_proposals_quantized.cl",
@@ -611,6 +612,7 @@
         "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
         "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
         "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+        "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp",
         "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
         "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
         "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
diff --git a/SConscript b/SConscript
index 522db94..d94745d 100644
--- a/SConscript
+++ b/SConscript
@@ -376,6 +376,7 @@
                        'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
                        'src/core/CL/cl_kernels/common/gemv.cl',
                        'src/core/CL/cl_kernels/common/gemmlowp.cl',
+                       'src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl',
                        'src/core/CL/cl_kernels/common/generate_proposals.cl',
                        'src/core/CL/cl_kernels/common/generate_proposals_quantized.cl',
                        'src/core/CL/cl_kernels/common/instance_normalization.cl',
diff --git a/filelist.json b/filelist.json
index a1a9778..c5de028 100644
--- a/filelist.json
+++ b/filelist.json
@@ -473,6 +473,7 @@
           "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
           "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
           "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp",
           "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
           "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
           "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
diff --git a/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000..72fe3d3
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref ClGemmReshapeRhsMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=4, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=1)
+ * @note The number of output columns processed by the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=4)
+ * @note The number of output rows processed by the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=4)
+ * @note The number of lhs columns (or rhs rows) processed by the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=16)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 4
+ *  - N0 = 1, 4, 8
+ *  - K0 = 4
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                               Pointer to the LHS tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_y                          Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                          Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                                 The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                                 The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                                 The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes     The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                               Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                          Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                          Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                                 The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                                 The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                                 The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes     The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                               (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in]  bia_stride_y                          (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                          (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                                 (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                                 (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                                 (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes     (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data type: same as @p lhs_ptr or S32
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                                 The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                                 The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                                 The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  M                                     Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                     Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                     Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ * @param[in]  sum_col_ptr                           (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_col_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                        (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                        (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                           (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_row_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                        (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                        (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_mmul(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(ADD_BIAS)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(ADD_BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+#if defined(A_OFFSET)
+    ,
+    TENSOR3D_T(sum_col, BUFFER)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+    ,
+    TENSOR3D_T(sum_row, BUFFER)
+#endif // defined(B_OFFSET)
+)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_M0)
+#define VEC_SIZE 4 // For int8 types, the input to the mmul instruction is a vector of length 4
+
+    uint x0 = get_global_id(0);
+    uint y0 = get_global_id(1);
+    uint z  = get_global_id(2);
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    uint lhs_x = VEC_SIZE * block_x;
+    uint lhs_y = dst_y;
+
+    uint rhs_x = VEC_SIZE * N0 * block_y;
+    uint rhs_y = 4 * block_id + block_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(OUT_DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    TILE(ACC_DATA_TYPE, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, VEC_SIZE, a);
+        T_LOAD(DATA_TYPE, M0, VEC_SIZE, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        TILE(DATA_TYPE, N0, VEC_SIZE, b);
+        T_LOAD(DATA_TYPE, N0, VEC_SIZE, BUFFER, rhs, 0, 0, 1, VEC_SIZE, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                VEC_TYPE vec_a = (VEC_TYPE)(a[m0].s[0], a[m0].s[1], a[m0].s[2], a[m0].s[3]);
+                VEC_TYPE vec_b = (VEC_TYPE)(b[n0].s[0], b[n0].s[1], b[n0].s[2], b[n0].s[3]);
+                c[m0].s[n0]    = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * N0 * sizeof(DATA_TYPE);
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+    TILE(int, M0, N0, offset_s32);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+    })
+
+#if defined(A_OFFSET)
+
+    TILE(int, 1, N0, a_offset_s32);
+
+    T_LOAD(int, 1, N0, BUFFER, sum_col, dst_x, z, 1, sum_col_stride_z, a_offset_s32);
+
+    a_offset_s32[0].v *= A_OFFSET;
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+
+    TILE(int, M0, 1, b_offset_s32);
+
+    T_LOAD(int, M0, 1, BUFFER, sum_row, dst_y, z * M, 1, 4, b_offset_s32);
+
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        offset_s32[m0].v += b_offset_s32[m0].v * B_OFFSET;
+    })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE) + z * bia_stride_y;
+
+    TILE(int, M0, N0, bias);
+
+    T_LOAD(int, M0, N0, BUFFER, bia, dst_x, dst_y, 1, 1, bias);
+
+    T_ADD(ACC_DATA_TYPE, M0, N0, offset_s32, bias, offset_s32);
+
+#else // defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE);
+
+    TILE(int, 1, N0, bias);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias[0].v = VLOAD(N0)(0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias[0].v, 0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(ADD_BIAS)
+
+    T_ADD(ACC_DATA_TYPE, M0, N0, c, offset_s32, c);
+    TILE(OUT_DATA_TYPE, M0, N0, c_lp);
+    T_QUANTIZE8(ACC_DATA_TYPE, OUT_DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c, 0, 0, c_lp);
+
+#if defined(MIN_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MIN_BOUND);
+    })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MAX_BOUND);
+    })
+#endif // defined(MAX_BOUND)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#else  // FUSED_OUTPUT_STAGE_FIXED_POINT
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+#endif // FUSED_OUTPUT_STAGE_FIXED_POINT
+}
+
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
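
To make the work-item mapping in gemmlowp_mm_reshaped_only_rhs_mmul concrete, here is a worked
example with illustrative values M0 = 2, N0 = 4 and the fixed MMUL_M0 = MMUL_N0 = 4, MMUL_K0 = 16
passed by the host side (M and N are assumed large enough that the min() clamps do not trigger):

    MMUL_BLOCK_SIZE = MMUL_N0 * MMUL_M0 = 16 work-items cooperate on each arm_matrix_multiply call.
    For the work-item x0 = 21, y0 = 1:
        block_id  = 21 / 16 = 1      thread_id = 21 % 16 = 5
        block_x   =  5 %  4 = 1      block_y   =  5 /  4 = 1
        dst_x     = block_x * N0 + block_id * MMUL_N0 * N0 = 4 + 16 = 20
        dst_y     = block_y * M0 + y0 * M0 * MMUL_M0       = 2 +  8 = 10
    i.e. this work-item accumulates the M0 x N0 = 2 x 4 destination tile whose top-left corner is
    (column 20, row 10), and the 16 work-items with block_id = 1, y0 = 1 jointly cover columns
    [16, 32) and rows [8, 16). Each iteration of the K loop advances by MMUL_K0 = 16; every
    work-item loads a VEC_SIZE = 4 chunk of those K values (lhs_x = VEC_SIZE * block_x), so the
    16 K values consumed per iteration are split across the cooperating work-items.
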
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 52661d6..0f08f5d 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -303,6 +303,7 @@
     { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
     { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
     { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" },
     { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
     { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
@@ -618,6 +619,10 @@
 #include "./cl_kernels/common/gemmlowp.clembed"
     },
     {
+        "common/gemmlowp_reshaped_only_rhs_mmul.cl",
+#include "./cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.clembed"
+    },
+    {
         "common/gemv.cl",
 #include "./cl_kernels/common/gemv.clembed"
     },
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index bfcd152..6baa31e 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -52,7 +52,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
                                                          1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
                                                          DataType::U16, DataType::U32, DataType::S32, DataType::F16,
                                                          DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
index 5c223fc..7fadfa7 100644
--- a/src/gpu/cl/kernels/ClCastKernel.h
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,7 @@
      *
      *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
      *   - U8  -> S8, U16, S16, U32, S32, F16, F32
+     *   - S8  -> U8, U16, S16, U32, S32, F16, F32
      *   - U16 -> U8, S8, S16, U32, S32, F16, F32
      *   - S16 -> U8, S8, U16, U32, S32, F16, F32
      *   - U32 -> U8, S8, U16, S16, S32, F16, F32
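
Below is a small sketch of the newly documented S8 -> U8 conversion through the public CLCast
function (the tensor shapes and conversion policy are illustrative assumptions; within this
patch it is the kernel-level ClCastKernel that gains QASYMM8_SIGNED source support):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    void cast_s8_to_u8()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::S8));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));

        CLCast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        cast.run();
    }
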
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
new file mode 100644
index 0000000..cdd047c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m;
+    const int n = gemm_info.n;
+    const int k = gemm_info.k;
+
+    TensorShape tensor_shape1{ src1->tensor_shape() };
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1          = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if(dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        if(output_stage.type == GEMMLowpOutputStageType::NONE)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+        }
+    }
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+    // Checks performed if the dst stage needs to be fused
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if(gemm_info.a_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+        }
+
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        if(gemm_info.b_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+            // Check if mm result is a 3D reinterpretation
+            const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+            // Validate input
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+            if(expected_dst_shape.num_dimensions() > 1)
+            {
+                const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+                vector_sum_row_shape.collapse_from(1);
+                TensorShape collapsed_dst_shape(expected_dst_shape);
+                collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+                                                "vector_sum_row must have the same number of batches of dst tensor");
+
+                if(gemm_info.a_offset != 0)
+                {
+                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                    vector_sum_col_shape.collapse_from(1);
+
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                }
+            }
+        }
+
+        if(dst->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+        if(output_multipliers != nullptr && output_shifts != nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+            if(output_stage.is_quantized_per_channel)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+            }
+        }
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
+                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+{
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
+
+    Window win{};
+    bool   window_changed = false;
+
+    constexpr unsigned int mmul_n0 = 4;
+    constexpr unsigned int mmul_m0 = 4;
+    constexpr unsigned int mmul_k0 = 16;
+
+    reinterpret_output_as_3d = false;
+    // dst tensor auto initialization if not yet initialized
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+    }
+    else
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+    }
+
+    TensorInfo tmp_info(*dst);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = 1;
+    num_elems_processed_per_iteration_y = 1;
+
+    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        if(gemm_info.a_offset != 0)
+        {
+            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+        }
+        // No access window needed for vector_sum_row
+        ARM_COMPUTE_UNUSED(vector_sum_row);
+
+        if(bias != nullptr)
+        {
+            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, bias_access);
+        }
+
+        if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        {
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
+        }
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    // Reconfigure the window size: one arm_matrix_multiply operation needs 16 cooperating threads to finish.
+    Window::Dimension x_dimension = collapsed.x();
+    Window::Dimension y_dimension = collapsed.y();
+
+    // Make M and N multiple of M0 and N0 respectively
+    const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(x_dimension.end(), gemm_info.rhs_info.n0);
+    const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(y_dimension.end(), gemm_info.lhs_info.m0);
+
+    // Divide M and N by M0 and N0 respectively
+    const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / gemm_info.rhs_info.n0;
+    const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / gemm_info.lhs_info.m0;
+
+    // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_k0 respectively
+    const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+    const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_k0);
+
+    // Ensure x_dimension is multiple of MMUL block size (mmul_n0 * mmul_m0)
+    x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_n0);
+    y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0);
+
+    collapsed.set(Window::DimX, x_dimension);
+    collapsed.set(Window::DimY, y_dimension);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
+                                                                  const GEMMKernelInfo &gemm_info,
+                                                                  ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
+                                                                  ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+
+    auto                          padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+    const int32_t                 a_offset     = gemm_info.a_offset;
+    const int32_t                 b_offset     = gemm_info.b_offset;
+    constexpr int                 mmul_m0      = 4;
+    constexpr int                 mmul_n0      = 4;
+    constexpr int                 mmul_k0      = 16;
+
+    _m = gemm_info.m;
+    _n = gemm_info.n;
+    _k = gemm_info.k;
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    const unsigned int m0_leftover = _m % lhs_info.m0;
+    const unsigned int n0_leftover = _n % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DVEC_TYPE=" + get_cl_type_from_data_type(src0->data_type()) + "4");
+    build_opts.add_option("-DACC_DATA_TYPE=int");
+    build_opts.add_option("-DOUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+    build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+    build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+    build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+    build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul");
+
+    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT");
+        _fuse_output_stage = true;
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if(a_offset != 0 && vector_sum_col != nullptr)
+        {
+            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+            build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+        }
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+        build_opts.add_option_if(gemm_info.broadcast_bias == true, "-DBROADCAST_BIAS");
+        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+        build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+        build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+
+        const int min = output_stage.gemmlowp_min_bound;
+        const int max = output_stage.gemmlowp_max_bound;
+
+        PixelValue min_val{};
+        PixelValue max_val{};
+        std::tie(min_val, max_val) = get_min_max(dst->data_type());
+        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    }
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (bias != nullptr ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                                                                   const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                                   const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
+                                                              src1->clone().get(),
+                                                              dst->clone().get(),
+                                                              gemm_info,
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
+                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                                              num_elements_processed)
+                                .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    auto       dst            = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    if(src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    cl::Image2D src1_image2d;
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3d_tensor_nhw_argument(idx, src0);
+        add_3d_tensor_nhw_argument(idx, src1);
+
+        // Bias buffer (only added when a bias tensor is provided)
+        if(src2 != nullptr)
+        {
+            add_3d_tensor_nhw_argument(idx, src2);
+        }
+        // dst buffer
+        add_3d_tensor_nhw_argument(idx, dst);
+
+        // Pass m, n and k at runtime as signed ints to ensure that the result of any subtraction they appear in remains signed.
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        if(_fuse_output_stage)
+        {
+            if(vector_sum_col != nullptr)
+            {
+                add_3d_tensor_nhw_argument(idx, vector_sum_col);
+            }
+            if(vector_sum_row != nullptr)
+            {
+                add_3d_tensor_nhw_argument(idx, vector_sum_row);
+            }
+        }
+
+        enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
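
As a sanity check on the window arithmetic in validate_and_configure_window() and the fixed
local size used in run_op(), consider an illustrative dispatch with M = N = 64, lhs_info.m0 = 2
and rhs_info.n0 = 4 (these values are assumptions for the example only):

    n_div_n0 = ceil(64 / 4) = 16   -> rounded up to a multiple of mmul_n0 (4)  = 16
    m_div_m0 = ceil(64 / 2) = 32   -> rounded up to a multiple of mmul_k0 (16) = 32
    global X = 16 * mmul_n0 = 64       global Y = 32 / mmul_m0 = 8
    local size (32, 2) = 64 work-items per work-group, i.e. four cooperative MMUL blocks of
    16 threads each, which is why the enqueue uses cl::NDRange(32, 2).
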
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
new file mode 100644
index 0000000..0ae549c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data types, using the MMUL extension, when only the RHS input matrix (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  src0               Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1               Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                Only the following values are supported for LHS info:
+     *                                lhs_info.m0: 1,2,4
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 1,4,8
+     *                                rhs_info.k0: same as lhs_info.k0: 4
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM] or same dimensionality as dst if gemm_info.broadcast_bias is false. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. Supported data types: S32.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                   ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
+                   ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
+                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _fuse_output_stage{ false };
+    signed int _m{ 1 };
+    signed int _n{ 1 };
+    signed int _k{ 1 };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H */
\ No newline at end of file
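
The constraints documented in the class above (m0 in {1, 2, 4}, n0 in {1, 4, 8}, k0 = 4,
transposed RHS) can be exercised with a GEMMKernelInfo populated the same way this patch does in
ClGemmLowpMatrixMultiplyCore. A hedged sketch using the internal validate() entry point follows;
the shapes and the S32 destination (i.e. no fused output stage) are assumptions, and the CL
backend must be initialised on a device exposing cl_arm_matrix_multiply for validation to pass:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"

    using namespace arm_compute;
    using namespace arm_compute::misc::shape_calculator;

    Status validate_mmul_gemmlowp_example()
    {
        const unsigned int m = 64, n = 64, k = 64;

        GEMMLHSMatrixInfo lhs_info{};
        lhs_info.m0 = 2; // 1, 2 or 4
        lhs_info.k0 = 4; // must be 4

        GEMMRHSMatrixInfo rhs_info{};
        rhs_info.n0        = 4; // 1, 4 or 8
        rhs_info.k0        = 4; // must be 4
        rhs_info.h0        = 1;
        rhs_info.transpose = true;

        GEMMKernelInfo gemm_info{};
        gemm_info.m        = m;
        gemm_info.n        = n;
        gemm_info.k        = k;
        gemm_info.lhs_info = lhs_info;
        gemm_info.rhs_info = rhs_info; // output_stage left as NONE, so dst must be S32

        const TensorInfo a(TensorShape(k, m), 1, DataType::QASYMM8);
        const TensorInfo b(TensorShape(n, k), 1, DataType::QASYMM8);
        TensorInfo       b_reshaped(b);
        b_reshaped.set_tensor_shape(compute_rhs_reshaped_shape(b, rhs_info));
        const TensorInfo dst(TensorShape(n, m), 1, DataType::S32);

        return opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(&a, &b_reshaped, &dst, gemm_info);
    }
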
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 7a62186..2622274 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,23 +23,15 @@
  */
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Log.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClCastKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
@@ -47,9 +39,6 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
 
-#include "src/common/utils/Log.h"
-#include "utils/TypePrinter.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -67,6 +56,7 @@
     {
         case CLGEMMKernelType::NATIVE:
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
         {
             return true;
         }
@@ -165,6 +155,41 @@
     return true;
 }
 
+// Validate lhs_info and rhs_info for the reshaped only rhs mmul kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
+                                                         unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo tmp_b_info{};
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+    // NOTE: This assumes:
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_and_configure_window).
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    // Since we ignore the output stage, output data type has to be S32 to pass the validation
+    TensorInfo output_info_copy(*output);
+    output_info_copy.set_data_type(DataType::S32);
+    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
                                                                                           const ITensorInfo *a,
@@ -184,6 +209,19 @@
     return { config.lhs_info, config.rhs_info };
 }
 
+// Select the GEMM configuration for the reshaped only rhs mmul kernel (currently only the default heuristics are used, no mlgo lookup)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
+                                                                                               const ITensorInfo *a,
+                                                                                               const ITensorInfo *b, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
+    auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
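+    // The default config is validated against the mmul kernel as a sanity check; the config is returned regardless of the result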
+    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+                                              to_string(config.rhs_info).c_str());
+    return { config.lhs_info, config.rhs_info };
+}
+
 inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
 {
     switch(kernel_type)
@@ -191,6 +229,7 @@
         case CLGEMMKernelType::NATIVE:
             return false;
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
             return true;
         default:
             ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
@@ -202,6 +241,7 @@
     : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
       _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
       _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
       _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
       _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
       _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
@@ -218,7 +258,7 @@
                                              const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? c : nullptr, output, gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
     ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
 
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -234,6 +274,7 @@
     // Set the target for the kernels
     _mm_native_kernel->set_target(gpu_target);
     _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
 
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
@@ -249,8 +290,7 @@
 
     const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    // Check if we need to reshape the matrix A and matrix B
-    _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run));
+    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
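+    // The selected kernel type (NATIVE, RESHAPED_ONLY_RHS or RESHAPED_ONLY_RHS_MMUL) drives the reshape and matrix multiply configuration below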
 
     if(_convert_to_qasymm8)
     {
@@ -261,7 +301,7 @@
     }
 
     ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
-    if(_is_gemm_reshaped)
+    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
     {
         matrix_b = &_tmp_b;
 
@@ -274,6 +314,19 @@
         // Configure reshape RHS kernel
         _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
+    else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+    {
+        matrix_b = &_tmp_b;
+
+        // Pick up the GEMM configuration
+        // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
+                                                                                      depth_output_gemm3d,
+                                                                                      a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+        // Configure reshape RHS kernel
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
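+        // The same RHS reshape kernel is reused for the mmul path; only the rhs_info chosen above differs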
+    }
 
     // Using default reduction info
     const GEMMLowpReductionKernelInfo reduction_info {};
@@ -326,20 +379,30 @@
 
         gemm_kernel_info.output_stage = gemmlowp_output_stage;
 
-        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
             _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                     _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
         }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            // Configure and tune matrix multiply kernel with fused output stage
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                         _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
         else
         {
             _run_output_stage = true;
 
-            if(_is_gemm_reshaped)
+            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
             {
                 _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
             }
+            else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+            {
+                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+            }
             else
             {
                 // Pick up the GEMM configuration
@@ -359,11 +422,16 @@
     else
     {
         _run_offset_contribution = true;
-        if(_is_gemm_reshaped)
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
         {
             // Configure and tune matrix multiply kernel
             _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
         }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
         else
         {
             // Pick up the GEMM configuration
@@ -382,7 +450,7 @@
 
     // Request memory
     _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
         _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
@@ -607,7 +675,7 @@
     const ITensor *matrix_a = a;
     const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
 
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         matrix_b = tmp_b.get();
         if(!_reshape_b_only_on_first_run)
@@ -645,7 +713,7 @@
     }
 
     // Run matrix multiply
-    if(_is_gemm_reshaped)
+    if(is_gemm_reshaped(_gemm_kernel_type))
     {
         ITensorPack gemm_reshaped_pack;
         if(_run_offset_contribution)
@@ -669,7 +737,18 @@
                 { TensorType::ACL_DST, dst },
             });
         }
-        CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
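+        // Dispatch the matrix multiply kernel selected at configure time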
+        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+        }
+        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Invalid reshaped kernel");
+        }
     }
     else
     {
@@ -728,7 +807,7 @@
             b->mark_as_unused();
         }
 
-        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
+        if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
         {
             // Run reshape kernel and mark original weights tensor as unused
             ITensorPack mtx_b_pack =
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
index 1965e3f..6fa4352 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@
 class ClCastKernel;
 class ClGemmLowpMatrixMultiplyNativeKernel;
 class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel;
 class ClGemmReshapeRhsMatrixKernel;
 class ClGemmLowpMatrixAReductionKernel;
 class ClGemmLowpMatrixBReductionKernel;
@@ -120,14 +121,15 @@
 
 private:
     // Kernels used
-    std::unique_ptr<kernels::ClCastKernel>                                  _weights_to_qasymm8;
-    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel>          _mm_native_kernel;
-    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
-    std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel>                  _mtx_b_reshape_kernel;
-    std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
-    std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
-    std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel>            _offset_contribution_kernel;
-    std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+    std::unique_ptr<kernels::ClCastKernel>                                      _weights_to_qasymm8;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel>              _mm_native_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>     _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+    std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel>                      _mtx_b_reshape_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel>                  _mtx_a_reduction_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel>                  _mtx_b_reduction_kernel;
+    std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel>                _offset_contribution_kernel;
+    std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel>     _offset_contribution_output_stage_kernel;
 
     // Temporary tensors
     TensorInfo _qasymm8_weights{};
@@ -138,15 +140,15 @@
     TensorInfo _gemm_output_stage_multipliers{};
     TensorInfo _gemm_output_stage_shifts{};
 
-    int32_t  _a_offset{ 0 };
-    int32_t  _b_offset{ 0 };
-    bool     _is_gemm_reshaped{ true };
-    bool     _reshape_b_only_on_first_run{ false };
-    bool     _run_output_stage{ false };
-    bool     _convert_to_qasymm8{ false };
-    bool     _run_offset_contribution{ false };
-    bool     _is_prepared{ false };
-    GEMMInfo _gemm_info{};
+    int32_t          _a_offset{ 0 };
+    int32_t          _b_offset{ 0 };
+    bool             _reshape_b_only_on_first_run{ false };
+    bool             _run_output_stage{ false };
+    bool             _convert_to_qasymm8{ false };
+    bool             _run_offset_contribution{ false };
+    bool             _is_prepared{ false };
+    GEMMInfo         _gemm_info{};
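+    // GEMM kernel type selected at configure time (replaces the previous _is_gemm_reshaped flag)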
+    CLGEMMKernelType _gemm_kernel_type{};
 
     experimental::MemoryRequirements _aux_mem{};
 };
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp
new file mode 100644
index 0000000..a0d13c3
--- /dev/null
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCast.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/CL/Helper.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/fixtures/GEMMLowpFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using namespace arm_compute::opencl::kernels;
+
+// Create function for CLGEMMReshapeRHSMatrixKernel
+using CLGEMMReshapeRHSMatrix = CLSynthetizeOperator<opencl::kernels::ClGemmReshapeRhsMatrixKernel>;
+
+// Create function for ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = CLSynthetizeOperator<opencl::kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>;
+
+// Fixture for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMUL
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULValidationFixture<CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS>;
+
+// Fixtures for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMUL with fused output stage
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureSigned =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture<int8_t, CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS, CLReductionOperation, CLCast>;
+
+using CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureUnsigned =
+    GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture<uint8_t, CLTensor, CLAccessor, CLGEMMReshapeRHSMatrix, CLGEMMLowpMatrixMultiplyReshapedOnlyRHS, CLReductionOperation, CLCast>;
+
+namespace
+{
+// *INDENT-OFF*
+// clang-format off
+
+/** M values to test */
+const auto m_values = framework::dataset::make("M", {16, 49});
+
+/** N values to test */
+const auto n_values = framework::dataset::make("N", {16, 259});
+
+/** K values to test */
+const auto k_values = framework::dataset::make("K", {192});
+
+/** Batch size values to test */
+const auto b_values = framework::dataset::make("batch_size", {1, 2});
+
+/** M0 values to test - Precommit */
+const auto m0 = framework::dataset::make("M0", {1, 2, 4});
+
+/** N0 values to test - Precommit */
+const auto n0 = framework::dataset::make("N0", { 1, 4, 8});
+
+/** K0 values to test - Precommit */
+const auto k0 = framework::dataset::make("K0", { 4 });
+
+/** H0 values to test - Precommit */
+const auto h0 = framework::dataset::make("H0", 1);
+
+/** Interleave values to test with RHS matrix */
+const auto i_values_rhs = framework::dataset::make("interleave_rhs", { false });
+
+/** Transpose values to test with RHS matrix */
+const auto t_values_rhs = framework::dataset::make("transpose_rhs", { true });
+
+const auto broadcast_bias = framework::dataset::make("broadcast_bias", {true, false});
+
+} // namespace
+
+TEST_SUITE(CL)
+TEST_SUITE(GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL)
+FIXTURE_DATA_TEST_CASE(Signed, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                    framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(Unsigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULFixture, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                    framework::dataset::make("DataType", { DataType::QASYMM8})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(OutputStageSigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureSigned, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   broadcast_bias),
+                    framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+FIXTURE_DATA_TEST_CASE(OutputStageUnsigned, CLGEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageFixtureUnsigned, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0),
+                                                                   n0),
+                                                                   k0),
+                                                                   h0),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   broadcast_bias),
+                    framework::dataset::make("DataType", { DataType::QASYMM8})))
+{
+    // Validate output
+    if(arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_arm_matrix_multiply not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // GEMMLowpMatrixMultiplyReshapedOnlyRhsMMUL
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 5fe7d83..6d073cd 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -24,19 +24,10 @@
 #ifndef ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE
 #define ARM_COMPUTE_TEST_GEMMLOWP_FIXTURE
 
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "tests/AssetsLibrary.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
-#include "tests/validation/Helpers.h"
 #include "tests/validation/reference/GEMMLowp.h"
-
-#include <random>
+#include "tests/validation/Validation.h"
 
 namespace arm_compute
 {
@@ -1362,6 +1353,370 @@
     SimpleTensor<int32_t> _reference{};
 };
 
+template <typename T, typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType, typename ReduceOperation, typename CastOperation>
+class GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULOutputStageValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
+               unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, bool broadcast_bias, DataType data_type)
+    {
+        GEMMLowpOutputStageInfo output_stage;
+        output_stage.type                    = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_stage.output_data_type        = data_type;
+        output_stage.gemmlowp_multipliers    = std::vector<int32_t> { 1 };
+        output_stage.gemmlowp_shifts         = std::vector<int32_t> { 1 };
+        output_stage.gemmlowp_multipliers[0] = 1;
+        output_stage.gemmlowp_shifts[0]      = 1;
+        output_stage.gemmlowp_offset         = 0;
+        constexpr float scale                = 0.001f;
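+        // Derive the fixed-point requantization multiplier and shift from the real scale above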
+        quantization::calculate_quantized_multiplier(scale, &output_stage.gemmlowp_multipliers[0], &output_stage.gemmlowp_shifts[0]);
+        output_stage.gemmlowp_min_bound = -100;
+        output_stage.gemmlowp_max_bound = 100;
+
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0 = m0;
+        lhs_info.k0 = k0;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0         = n0;
+        rhs_info.k0         = k0;
+        rhs_info.h0         = h0;
+        rhs_info.interleave = interleave_rhs;
+        rhs_info.transpose  = transpose_rhs;
+
+        int a_offset = 1;
+        int b_offset = 1;
+
+        // Set the tensor shapes for LHS and RHS matrices
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+        const TensorShape bias_shape(n,
+                                     broadcast_bias ? 1 : m,
+                                     broadcast_bias ? 1 : batch_size);
+
+        _target    = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, output_stage, a_offset, b_offset);
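+        // The reference is only computed when the target could be configured, i.e. when the extension required by the kernel is available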
+        if(gemm_validated == true)
+        {
+            _reference = compute_reference(lhs_shape, rhs_shape, bias_shape, data_type, output_stage, a_offset, b_offset);
+        }
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::QASYMM8:
+            {
+                // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+                std::uniform_int_distribution<> distribution(1, 254);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::uniform_int_distribution<> distribution(-127, 126);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::S32:
+            {
+                std::uniform_int_distribution<> distribution(-10000, 10000);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Unsupported data type");
+        }
+    }
+
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info,
+                              const GEMMRHSMatrixInfo &rhs_info, DataType data_type, GEMMLowpOutputStageInfo output_stage, const int a_offset, const int b_offset)
+    {
+        // Create tensors
+        TensorType lhs  = create_tensor<TensorType>(lhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, a_offset));
+        TensorType rhs  = create_tensor<TensorType>(rhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, b_offset));
+        TensorType bias = create_tensor<TensorType>(bias_shape, DataType::S32, 1);
+        TensorType dst;
+        TensorType rhs_reshaped;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+
+        // Tensors for precomputing sum of lhs rows / rhs columns
+        TensorType vec_sum_rows = create_tensor<TensorType>(TensorShape(M, 1, lhs_shape[2]), DataType::S32, 1);
+        TensorType vec_sum_cols = create_tensor<TensorType>(TensorShape(N, 1, rhs_shape[2]), DataType::S32, 1);
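+        // The row/column sums are passed to the kernel so it can apply the a_offset/b_offset contributions to the S32 accumulators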
+
+        GEMMKernelInfo gemm_info;
+        gemm_info.m            = M;
+        gemm_info.n            = N;
+        gemm_info.k            = K;
+        gemm_info.lhs_info     = lhs_info;
+        gemm_info.rhs_info     = rhs_info;
+        gemm_info.output_stage = output_stage;
+        gemm_info.a_offset     = a_offset;
+        gemm_info.b_offset     = b_offset;
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMFunctionType       gemm;
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+
+        // If GEMM is not validated, do not try to run. The validation checks whether the device
+        // supports the cl_arm_matrix_multiply extension; if it does not, the test is skipped.
+        // Running without support would fail anyway because target and reference
+        // would not match.
+        gemm_validated = bool(gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, vec_sum_cols.info(), vec_sum_rows.info(), bias.info()));
+        if(gemm_validated == true)
+        {
+            gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, vec_sum_cols.info(), vec_sum_rows.info(), bias.info());
+
+            ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+
+            // Allocate tensors
+            lhs.allocator()->allocate();
+            rhs.allocator()->allocate();
+            rhs_reshaped.allocator()->allocate();
+            bias.allocator()->allocate();
+            vec_sum_cols.allocator()->allocate();
+            vec_sum_rows.allocator()->allocate();
+            dst.allocator()->allocate();
+
+            ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!vec_sum_cols.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!vec_sum_rows.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+            // Fill tensors
+            fill(AccessorType(lhs), 0);
+            fill(AccessorType(rhs), 1);
+            fill(AccessorType(bias), 2);
+
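+            // Cast the quantized inputs to S32 before reducing so the precomputed sums match the S32 vec_sum_* tensors expected by the kernel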
+            TensorType    lhs_32 = create_tensor<TensorType>(lhs_shape, DataType::S32, 1);
+            TensorType    rhs_32 = create_tensor<TensorType>(rhs_shape, DataType::S32, 1);
+            CastOperation cast_lhs;
+            CastOperation cast_rhs;
+            cast_lhs.configure(&lhs, &lhs_32, ConvertPolicy::SATURATE);
+            cast_rhs.configure(&rhs, &rhs_32, ConvertPolicy::SATURATE);
+            lhs_32.allocator()->allocate();
+            rhs_32.allocator()->allocate();
+            cast_lhs.run();
+            cast_rhs.run();
+
+            ReduceOperation lhs_sum_rows;
+            ReduceOperation rhs_sum_cols;
+
+            lhs_sum_rows.configure(&lhs_32, &vec_sum_rows, 0, ReductionOperation::SUM, false);
+            rhs_sum_cols.configure(&rhs_32, &vec_sum_cols, 1, ReductionOperation::SUM);
+
+            lhs_sum_rows.run();
+            rhs_sum_cols.run();
+
+            // Compute GEMM
+            ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+            reshape_rhs.run(reshape_rhs_pack);
+            ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_SRC_2, &bias }, { ACL_DST, &dst }, { ACL_VEC_COL_SUM, &vec_sum_cols }, { ACL_VEC_ROW_SUM, &vec_sum_rows } });
+            gemm.run(gemm_pack);
+        }
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, DataType data_type, GEMMLowpOutputStageInfo output_stage,
+                                      const int a_offset, const int b_offset)
+    {
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        // Create reference
+        SimpleTensor<T>       lhs{ lhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, a_offset) };
+        SimpleTensor<T>       rhs{ rhs_shape, data_type, 1, QuantizationInfo(1.0f / 255, b_offset) };
+        SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
+        SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+        SimpleTensor<T>       dst_final{ dst_shape, data_type, 1 };
+
+        // Fill reference
+        fill(lhs, 0);
+        fill(rhs, 1);
+        fill(bias, 2);
+
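+        // The reference accumulates in S32 and is then requantized with the same output stage parameters as the target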
+        dst       = reference::gemmlowp_matrix_multiply_core<int32_t, T>(lhs, rhs, dst_shape, a_offset, b_offset);
+        dst_final = reference::gemmlowp_quantize_down_scale_by_fixedpoint<int32_t, T>(dst, bias,
+                                                                                      output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+        return dst_final;
+    }
+
+    bool            gemm_validated = true;
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
+class GEMMLowpMatrixMultiplyReshapedOnlyRHSMMULValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0,
+               unsigned int k0, unsigned int h0, bool interleave_rhs, bool transpose_rhs, DataType data_type)
+    {
+        GEMMLHSMatrixInfo lhs_info;
+        lhs_info.m0 = m0;
+        lhs_info.k0 = k0;
+
+        GEMMRHSMatrixInfo rhs_info;
+        rhs_info.n0         = n0;
+        rhs_info.k0         = k0;
+        rhs_info.h0         = h0;
+        rhs_info.interleave = interleave_rhs;
+        rhs_info.transpose  = transpose_rhs;
+
+        // Set the tensor shapes for LHS and RHS matrices
+        const TensorShape lhs_shape(k, m, batch_size);
+        const TensorShape rhs_shape(n, k, batch_size);
+
+        _target    = compute_target(lhs_shape, rhs_shape, lhs_info, rhs_info, data_type);
+        if(gemm_validated == true)
+        {
+            _reference = compute_reference(lhs_shape, rhs_shape, data_type);
+        }
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::QASYMM8:
+            {
+                // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+                std::uniform_int_distribution<> distribution(1, 254);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            case DataType::QASYMM8_SIGNED:
+            {
+                std::uniform_int_distribution<> distribution(-127, 126);
+                library->fill(tensor, distribution, i);
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Unsupported data type");
+        }
+    }
+
+    TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const GEMMLHSMatrixInfo &lhs_info,
+                              const GEMMRHSMatrixInfo &rhs_info, DataType data_type)
+    {
+        // Create tensors
+        TensorType lhs = create_tensor<TensorType>(lhs_shape, data_type, 1);
+        TensorType rhs = create_tensor<TensorType>(rhs_shape, data_type, 1);
+        TensorType rhs_reshaped;
+        TensorType dst;
+
+        const unsigned int M = lhs_shape[1];
+        const unsigned int N = rhs_shape[0];
+        const unsigned int K = lhs_shape[0];
+
+        GEMMKernelInfo gemm_info;
+        gemm_info.m        = M;
+        gemm_info.n        = N;
+        gemm_info.k        = K;
+        gemm_info.lhs_info = lhs_info;
+        gemm_info.rhs_info = rhs_info;
+        // The output tensor will be auto-initialized within the function
+
+        // Create and configure function
+        ReshapeRHSOperatorType reshape_rhs;
+        GEMMFunctionType       gemm;
+        reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info);
+
+        // If GEMM is not validated, do not try to run. The validation checks whether the device
+        // supports the cl_arm_matrix_multiply extension; if it does not, the test is skipped.
+        // Running without support would fail anyway because target and reference
+        // would not match.
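+        // vec_sum_cols, vec_sum_rows and bias are nullptr here: this fixture uses zero offsets and no fused output stage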
+        gemm_validated = bool(gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, nullptr, nullptr, nullptr));
+        if(gemm_validated == true)
+        {
+            gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, nullptr, nullptr, nullptr);
+
+            ARM_COMPUTE_ASSERT(lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(rhs.info()->is_resizable());
+
+            // Allocate tensors
+            lhs.allocator()->allocate();
+            rhs.allocator()->allocate();
+            rhs_reshaped.allocator()->allocate();
+            dst.allocator()->allocate();
+
+            ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable());
+            ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+            // Fill tensors
+            fill(AccessorType(lhs), 0);
+            fill(AccessorType(rhs), 1);
+
+            // Compute GEMM
+            ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } };
+            reshape_rhs.run(reshape_rhs_pack);
+            ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, { ACL_SRC_1, &rhs_reshaped }, { ACL_DST, &dst } });
+            gemm.run(gemm_pack);
+        }
+
+        return dst;
+    }
+
+    SimpleTensor<int32_t> compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type)
+    {
+        TensorShape dst_shape = lhs_shape;
+        dst_shape[0]          = rhs_shape[0];
+        dst_shape[1]          = lhs_shape[1];
+
+        if(data_type == DataType::QASYMM8)
+        {
+            // Create reference
+            SimpleTensor<uint8_t> lhs{ lhs_shape, data_type, 1 };
+            SimpleTensor<uint8_t> rhs{ rhs_shape, data_type, 1 };
+            SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+
+            // Fill reference
+            fill(lhs, 0);
+            fill(rhs, 1);
+
+            return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t>(lhs, rhs, dst_shape, 0, 0);
+        }
+        else
+        {
+            // Create reference
+            SimpleTensor<int8_t>  lhs{ lhs_shape, data_type, 1 };
+            SimpleTensor<int8_t>  rhs{ rhs_shape, data_type, 1 };
+            SimpleTensor<int32_t> dst{ dst_shape, DataType::S32, 1 };
+
+            // Fill reference
+            fill(lhs, 0);
+            fill(rhs, 1);
+
+            return reference::gemmlowp_matrix_multiply_core<int32_t, int8_t>(lhs, rhs, dst_shape, 0, 0);
+        }
+    }
+
+    bool                  gemm_validated = true;
+    TensorType            _target{};
+    SimpleTensor<int32_t> _reference{};
+};
+
 template <typename TensorType, typename AccessorType, typename ReshapeRHSOperatorType, typename GEMMFunctionType>
 class GEMMLowpMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture
 {