Implement OpenCL MatMul for Lhs NT Rhs T/NT FP32/16

 - Implement ClNativeMatMulKernel class
 - Implement OpenCL kernel for LHS non-transposed and RHS non-transposed
 - Implement OpenCL kernel for LHS non-transposed and RHS transposed
 - Add test fixture and dataset for matmul
 - Implement transpose_tensor() for reference implementation to transpose high-dimensional tensors

Resolves: COMPMID-5944, COMPMID-5951

Co-authored-by: Gunes Bayir <gunes.bayir@arm.com>
Co-authored-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I1d5b8978f41be27baddb3153ade880472141573f
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9333
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/SConscript b/SConscript
index a480c45..205764b 100644
--- a/SConscript
+++ b/SConscript
@@ -359,6 +359,7 @@
                        'src/core/CL/cl_kernels/common/cast.cl',
                        'src/core/CL/cl_kernels/common/comparisons.cl',
                        'src/core/CL/cl_kernels/common/concatenate.cl',
+                       'src/core/CL/cl_kernels/common/convolution_layer.cl',
                        'src/core/CL/cl_kernels/common/col2im.cl',
                        'src/core/CL/cl_kernels/common/convert_fc_weights.cl',
                        'src/core/CL/cl_kernels/common/copy_tensor.cl',
@@ -368,6 +369,9 @@
                        'src/core/CL/cl_kernels/common/elementwise_operation.cl',
                        'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
                        'src/core/CL/cl_kernels/common/elementwise_unary.cl',
+                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
+                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
+                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
                        'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
                        'src/core/CL/cl_kernels/common/fft.cl',
                        'src/core/CL/cl_kernels/common/fft_scale.cl',
@@ -377,21 +381,18 @@
                        'src/core/CL/cl_kernels/common/gemm.cl',
                        'src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl',
                        'src/core/CL/cl_kernels/common/gemm_utils.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped.cl',
-                       'src/core/CL/cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl',
-                       'src/core/CL/cl_kernels/common/gemv.cl',
                        'src/core/CL/cl_kernels/common/gemmlowp.cl',
                        'src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl',
+                       'src/core/CL/cl_kernels/common/gemv.cl',
                        'src/core/CL/cl_kernels/common/generate_proposals.cl',
                        'src/core/CL/cl_kernels/common/generate_proposals_quantized.cl',
                        'src/core/CL/cl_kernels/common/instance_normalization.cl',
                        'src/core/CL/cl_kernels/common/l2_normalize.cl',
+                       'src/core/CL/cl_kernels/common/mat_mul.cl',
                        'src/core/CL/cl_kernels/common/mean_stddev_normalization.cl',
-                       'src/core/CL/cl_kernels/common/unpooling_layer.cl',
                        'src/core/CL/cl_kernels/common/memset.cl',
-                       'src/core/CL/cl_kernels/common/nonmax.cl',
                        'src/core/CL/cl_kernels/common/minmax_layer.cl',
+                       'src/core/CL/cl_kernels/common/nonmax.cl',
                        'src/core/CL/cl_kernels/common/pad_layer.cl',
                        'src/core/CL/cl_kernels/common/permute.cl',
                        'src/core/CL/cl_kernels/common/pixelwise_mul_float.cl',
@@ -401,18 +402,18 @@
                        'src/core/CL/cl_kernels/common/range.cl',
                        'src/core/CL/cl_kernels/common/reduction_operation.cl',
                        'src/core/CL/cl_kernels/common/reshape_layer.cl',
-                       'src/core/CL/cl_kernels/common/convolution_layer.cl',
                        'src/core/CL/cl_kernels/common/reverse.cl',
                        'src/core/CL/cl_kernels/common/roi_align_layer.cl',
                        'src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl',
                        'src/core/CL/cl_kernels/common/roi_pooling_layer.cl',
                        'src/core/CL/cl_kernels/common/select.cl',
+                       'src/core/CL/cl_kernels/common/slice_ops.cl',
                        'src/core/CL/cl_kernels/common/softmax_layer.cl',
                        'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl',
                        'src/core/CL/cl_kernels/common/stack_layer.cl',
-                       'src/core/CL/cl_kernels/common/slice_ops.cl',
                        'src/core/CL/cl_kernels/common/tile.cl',
-                       'src/core/CL/cl_kernels/common/transpose.cl'
+                       'src/core/CL/cl_kernels/common/transpose.cl',
+                       'src/core/CL/cl_kernels/common/unpooling_layer.cl'
                     ]
 
     # NCHW kernels