Implement OpenCL MatMul for LHS NT, RHS T/NT (FP32/FP16)

 - Implement ClNativeMatMulKernel class
 - Implement OpenCL kernel for LHS non-transposed and RHS non-transposed
 - Implement OpenCL kernel for LHS non-transposed and RHS transposed
 - Add test fixture and dataset for matmul
 - Implement transpose_tensor() in the reference implementation to transpose high-dimensional tensors

Resolves: COMPMID-5944, COMPMID-5951

Co-authored-by: Gunes Bayir <gunes.bayir@arm.com>
Co-authored-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I1d5b8978f41be27baddb3153ade880472141573f
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9333
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index db27ddc..c3af0a2 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -421,7 +421,8 @@
  */
 inline ::std::ostream &operator<<(::std::ostream &os, const GEMMRHSMatrixInfo &gemm_info)
 {
-    os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << "  h0=" << gemm_info.h0 << "  trans=" << gemm_info.transpose << "  inter=" << gemm_info.interleave << " exp_img=" << gemm_info.export_to_cl_image << "})";
+    os << "( n0=" << (unsigned int)gemm_info.n0 << " k0=" << gemm_info.k0 << "  h0=" << gemm_info.h0 << "  trans=" << gemm_info.transpose << "  inter=" << gemm_info.interleave << " exp_img=" <<
+       gemm_info.export_to_cl_image << "})";
     return os;
 }
 
@@ -474,7 +475,8 @@
 inline ::std::ostream &operator<<(::std::ostream &os, const BoundingBoxTransformInfo &bbox_info)
 {
     auto weights = bbox_info.weights();
-    os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] << "})";
+    os << "(" << bbox_info.img_width() << "x" << bbox_info.img_height() << ")~" << bbox_info.scale() << "(weights={" << weights[0] << ", " << weights[1] << ", " << weights[2] << ", " << weights[3] <<
+       "})";
     return os;
 }
 
@@ -3333,46 +3335,46 @@
 inline std::string to_string(const WeightFormat wf)
 {
 #define __CASE_WEIGHT_FORMAT(wf) \
-    case WeightFormat::wf:       \
-        return #wf;
+case WeightFormat::wf:       \
+    return #wf;
     switch(wf)
     {
-        __CASE_WEIGHT_FORMAT(UNSPECIFIED)
-        __CASE_WEIGHT_FORMAT(ANY)
-        __CASE_WEIGHT_FORMAT(OHWI)
-        __CASE_WEIGHT_FORMAT(OHWIo2)
-        __CASE_WEIGHT_FORMAT(OHWIo4)
-        __CASE_WEIGHT_FORMAT(OHWIo8)
-        __CASE_WEIGHT_FORMAT(OHWIo16)
-        __CASE_WEIGHT_FORMAT(OHWIo32)
-        __CASE_WEIGHT_FORMAT(OHWIo64)
-        __CASE_WEIGHT_FORMAT(OHWIo128)
-        __CASE_WEIGHT_FORMAT(OHWIo4i2)
-        __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo8i2)
-        __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo16i2)
-        __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo32i2)
-        __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo64i2)
-        __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo4i4)
-        __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo8i4)
-        __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo16i4)
-        __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo32i4)
-        __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo64i4)
-        __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
-        __CASE_WEIGHT_FORMAT(OHWIo2i8)
-        __CASE_WEIGHT_FORMAT(OHWIo4i8)
-        __CASE_WEIGHT_FORMAT(OHWIo8i8)
-        __CASE_WEIGHT_FORMAT(OHWIo16i8)
-        __CASE_WEIGHT_FORMAT(OHWIo32i8)
-        __CASE_WEIGHT_FORMAT(OHWIo64i8)
+            __CASE_WEIGHT_FORMAT(UNSPECIFIED)
+            __CASE_WEIGHT_FORMAT(ANY)
+            __CASE_WEIGHT_FORMAT(OHWI)
+            __CASE_WEIGHT_FORMAT(OHWIo2)
+            __CASE_WEIGHT_FORMAT(OHWIo4)
+            __CASE_WEIGHT_FORMAT(OHWIo8)
+            __CASE_WEIGHT_FORMAT(OHWIo16)
+            __CASE_WEIGHT_FORMAT(OHWIo32)
+            __CASE_WEIGHT_FORMAT(OHWIo64)
+            __CASE_WEIGHT_FORMAT(OHWIo128)
+            __CASE_WEIGHT_FORMAT(OHWIo4i2)
+            __CASE_WEIGHT_FORMAT(OHWIo4i2_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo8i2)
+            __CASE_WEIGHT_FORMAT(OHWIo8i2_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo16i2)
+            __CASE_WEIGHT_FORMAT(OHWIo16i2_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo32i2)
+            __CASE_WEIGHT_FORMAT(OHWIo32i2_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo64i2)
+            __CASE_WEIGHT_FORMAT(OHWIo64i2_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo4i4)
+            __CASE_WEIGHT_FORMAT(OHWIo4i4_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo8i4)
+            __CASE_WEIGHT_FORMAT(OHWIo8i4_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo16i4)
+            __CASE_WEIGHT_FORMAT(OHWIo16i4_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo32i4)
+            __CASE_WEIGHT_FORMAT(OHWIo32i4_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo64i4)
+            __CASE_WEIGHT_FORMAT(OHWIo64i4_bf16)
+            __CASE_WEIGHT_FORMAT(OHWIo2i8)
+            __CASE_WEIGHT_FORMAT(OHWIo4i8)
+            __CASE_WEIGHT_FORMAT(OHWIo8i8)
+            __CASE_WEIGHT_FORMAT(OHWIo16i8)
+            __CASE_WEIGHT_FORMAT(OHWIo32i8)
+            __CASE_WEIGHT_FORMAT(OHWIo64i8)
         default:
             return "invalid value";
     }
@@ -3677,6 +3679,40 @@
     return str.str();
 }
 
+/** Formatted output of the arm_compute::MatMulKernelInfo type, printed as "MatMulKernelInfo=[field=value, ...]".
+ *
+ * @param[out] os          Output stream.
+ * @param[in]  matmul_info arm_compute::MatMulKernelInfo type to output.
+ *
+ * @return The output stream @p os, after the fields have been written to it.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const arm_compute::MatMulKernelInfo &matmul_info)
+{
+    os << "MatMulKernelInfo="
+       << "["
+       << "adj_lhs=" << matmul_info.adj_lhs << ", "
+       << "adj_rhs=" << matmul_info.adj_rhs << ", "
+       << "M0=" << matmul_info.m0 << ", "
+       << "N0=" << matmul_info.n0 << ", "
+       << "K0=" << matmul_info.k0 << ", "
+       << "export_rhs_to_cl_image=" << matmul_info.export_rhs_to_cl_image
+       << "]";
+
+    return os;
+}
+/** Convert an arm_compute::MatMulKernelInfo to a string by streaming it into a std::stringstream.
+ *
+ * @param[in] matmul_info arm_compute::MatMulKernelInfo type to output.
+ *
+ * @return String holding the text produced by the stream insertion operator for @p matmul_info.
+ */
+inline std::string to_string(const arm_compute::MatMulKernelInfo &matmul_info)
+{
+    std::stringstream str;
+    str << matmul_info;
+    return str.str();
+}
+
 } // namespace arm_compute
 
 #endif /* __ARM_COMPUTE_TYPE_PRINTER_H__ */