COMPMID-2097: Implement a heuristic to dispatch CLGEMMReshapedOnlyRHS kernel from CLGEMM

Change-Id: I4170a80647b02501aa669e2c0347ddc39888ee76
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/928
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
similarity index 60%
rename from arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h
rename to arm_compute/core/CL/ICLGEMMKernelConfiguration.h
index 500d9cd..2e6d495 100644
--- a/arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h
+++ b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
@@ -21,20 +21,37 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__
-#define __ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__
+#ifndef __ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__
 
+#include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
-/** Basic interface for the GEMM selection */
-class ICLGEMMReshapedConfiguration
+/** Basic interface for the GEMM kernel configuration */
+class ICLGEMMKernelConfiguration
 {
 public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    ICLGEMMKernelConfiguration(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    /** Prevent instances of this class from being copied (configuration objects are move-only) */
+    ICLGEMMKernelConfiguration(const ICLGEMMKernelConfiguration &) = delete;
+    /** Prevent instances of this class from being copy-assigned (configuration objects are move-only) */
+    ICLGEMMKernelConfiguration &operator=(const ICLGEMMKernelConfiguration &) = delete;
+    /** Default Move Constructor. */
+    ICLGEMMKernelConfiguration(ICLGEMMKernelConfiguration &&) = default;
+    /** Default move assignment operator */
+    ICLGEMMKernelConfiguration &operator=(ICLGEMMKernelConfiguration &&) = default;
     /** Virtual destructor */
-    virtual ~ICLGEMMReshapedConfiguration() = default;
-    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used with @ref CLGEMMMatrixMultiplyReshapedKernel
+    virtual ~ICLGEMMKernelConfiguration() = default;
+    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used by the GEMM kernel
      *
      * @param[in] m         Number of rows LHS matrix
      * @param[in] n         Number of columns RHS matrix
@@ -43,6 +60,9 @@
      * @param[in] data_type Data type
      */
     virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+    GPUTarget _target;
 };
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__ */
+#endif /*__ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
new file mode 100644
index 0000000..d263712
--- /dev/null
+++ b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMHELPERS_H__
+#define __ARM_COMPUTE_CLGEMMHELPERS_H__
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m              Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n              Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0             Number of rows processed by each thread/work-item
+ * @param[in] n0             Number of columns processed by each thread/work-item
+ * @param[in] k0             Number of inner accumulations performed by each thread/work-item
+ * @param[in] v0             Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0             Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose  True if the (m0xk0) block has to be transposed before being stored
+ * @param[in] rhs_transpose  True if the (k0xn0) block has to be transposed before being stored
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose);
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMHELPERS_H__ */
diff --git a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
similarity index 63%
rename from arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h
rename to arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index 3458911..105a58a 100644
--- a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
-#define __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__
 
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
 
 #include <memory>
 
@@ -34,23 +33,27 @@
 {
 namespace cl_gemm
 {
-/** Tuner factory class */
-class CLGEMMReshapedConfigurationFactory final
+/** CLGEMMReshaped factory class */
+class CLGEMMReshapedKernelConfigurationFactory final
 {
 public:
-    static std::unique_ptr<ICLGEMMReshapedConfiguration> create()
+    /** Static method to create the CLGEMMReshaped kernel configuration class according to the GPU architecture
+     *
+     * @param[in] arch GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class, or nullptr if no configuration exists for the GPU architecture
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget arch)
     {
-        GPUTarget arch = get_arch_from_target(CLScheduler::get().target());
-
-        switch(arch)
+        switch(get_arch_from_target(arch))
         {
             case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMReshapedConfigurationBifrost>();
+                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(arch);
             default:
                 return nullptr;
         }
     }
 };
-} // namespace tuners
+} // namespace cl_gemm
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__ */
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
similarity index 61%
rename from arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h
rename to arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index c452e15..a0aae19 100644
--- a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -21,19 +21,33 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__
-#define __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__
 
-#include "arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 
 namespace arm_compute
 {
 namespace cl_gemm
 {
-/** Bifrost based OpenCL GEMM reshaped configuration */
-class CLGEMMReshapedConfigurationBifrost final : public ICLGEMMReshapedConfiguration
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
 {
 public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    CLGEMMReshapedKernelConfigurationBifrost(GPUTarget arch);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapedKernelConfigurationBifrost(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapedKernelConfigurationBifrost &operator=(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
+    /** Default Move Constructor. */
+    CLGEMMReshapedKernelConfigurationBifrost(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
+    /** Default move assignment operator */
+    CLGEMMReshapedKernelConfigurationBifrost &operator=(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
+
     // Inherited overridden method
     std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
@@ -45,4 +59,4 @@
 };
 } // namespace cl_gemm
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__ */
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__ */
diff --git a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
similarity index 61%
copy from arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h
copy to arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
index 3458911..b9bf150 100644
--- a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
-#define __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__
 
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
 
 #include <memory>
 
@@ -34,23 +33,27 @@
 {
 namespace cl_gemm
 {
-/** Tuner factory class */
-class CLGEMMReshapedConfigurationFactory final
+/** CLGEMMReshapedOnlyRHS factory class */
+class CLGEMMReshapedOnlyRHSKernelConfigurationFactory final
 {
 public:
-    static std::unique_ptr<ICLGEMMReshapedConfiguration> create()
+    /** Static method to create the CLGEMMReshapedOnlyRHS kernel configuration class according to the GPU architecture
+     *
+     * @param[in] arch GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class, or nullptr if no configuration exists for the GPU architecture
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget arch)
     {
-        GPUTarget arch = get_arch_from_target(CLScheduler::get().target());
-
-        switch(arch)
+        switch(get_arch_from_target(arch))
         {
             case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMReshapedConfigurationBifrost>();
+                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(arch);
             default:
                 return nullptr;
         }
     }
 };
-} // namespace tuners
+} // namespace cl_gemm
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__ */
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
new file mode 100644
index 0000000..3bed118
--- /dev/null
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget arch);
+    /** Prevent instances of this class from being copied (configuration objects are move-only) */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
+    /** Prevent instances of this class from being copy-assigned (configuration objects are move-only) */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
+    /** Default Move Constructor. */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
+    /** Default move assignment operator */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 0d07266..384bd46 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -887,23 +887,20 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
 
+    const bool reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
     const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0;
     const int  depth_output_gemm3d      = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1;
 
     // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
     // dimension of the output tensor
-    const int dim0 = gemm_info.n();
-    const int dim1 = gemm_info.m() / depth_output_gemm3d;
-    const int dim2 = input0.tensor_shape()[2];
-    const int dim3 = input0.tensor_shape()[3];
+    const int batch_size = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
 
     TensorShape output_shape{ input0.tensor_shape() };
 
-    output_shape.set(0, dim0);
-    output_shape.set(1, dim1);
-    output_shape.set(2, reinterpret_output_as_3d ? depth_output_gemm3d : dim2);
-    output_shape.set(3, reinterpret_output_as_3d ? dim2 : dim3);
-    output_shape.set(4, reinterpret_output_as_3d ? dim3 : 1);
+    output_shape.set(0, gemm_info.n());
+    output_shape.set(1, gemm_info.m() / depth_output_gemm3d);
+    output_shape.set(2, reinterpret_output_as_3d ? depth_output_gemm3d : batch_size);
+    output_shape.set(3, reinterpret_output_as_3d ? batch_size : 1);
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 0bad446..8c462fa 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
@@ -40,10 +41,11 @@
 
 /** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
  *
- *  -# @ref CLGEMMReshapeLHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model)
- *  -# @ref CLGEMMReshapeRHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model)
- *  -# @ref CLGEMMMatrixMultiplyKernel (if GPU target is NOT G76 or if the reshaped GEMM is NOT selected)
- *  -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target IS Mali-G76)
+ *  -# @ref CLGEMMReshapeLHSMatrixKernel (only if RESHAPED_V1 is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMReshapeRHSMatrixKernel (only if either RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyKernel (only if either NATIVE or RESHAPED_V1 is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_type() method)
  *  -# @ref CLGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
  *
  */
@@ -102,20 +104,41 @@
     void prepare() override;
 
 private:
-    CLMemoryGroup                      _memory_group;
-    CLGEMMMatrixMultiplyKernel         _mm_kernel;
-    CLGEMMMatrixAdditionKernel         _ma_kernel;
-    CLGEMMReshapeLHSMatrixKernel       _reshape_lhs_kernel;
-    CLGEMMReshapeRHSMatrixKernel       _reshape_rhs_kernel;
-    CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel;
-    CLTensor                           _tmp_a;
-    CLTensor                           _tmp_b;
-    const ICLTensor                   *_original_b;
-    bool                               _is_interleaved_transposed;
-    bool                               _run_addition;
-    bool                               _reshape_b_only_on_first_run;
-    bool                               _is_prepared;
-    bool                               _is_new_gemm_reshaped; // Remove when COMPMID-1892 is completed
+    enum class GEMMType
+    {
+        NATIVE,
+        RESHAPED_V1,
+        RESHAPED_V2,
+        RESHAPED_ONLY_RHS
+    };
+
+    // TODO (COMPMID-2095)
+    static GEMMType select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target);
+
+    void configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+    static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+    CLMemoryGroup                             _memory_group;
+    CLGEMMMatrixMultiplyKernel                _mm_kernel;
+    CLGEMMMatrixAdditionKernel                _ma_kernel;
+    CLGEMMReshapeLHSMatrixKernel              _reshape_lhs_kernel;
+    CLGEMMReshapeRHSMatrixKernel              _reshape_rhs_kernel;
+    CLGEMMMatrixMultiplyReshapedKernel        _mm_reshaped_kernel;
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
+    CLTensor                                  _tmp_a;
+    CLTensor                                  _tmp_b;
+    const ICLTensor                          *_original_b;
+    bool                                      _run_addition;
+    bool                                      _reshape_b_only_on_first_run;
+    bool                                      _is_prepared;
+    GEMMType                                  _gemm_type;
 };
 } // namespace arm_compute