Implement OpenCL MatMul for Lhs NT Rhs T/NT FP32/16

 - Implement ClNativeMatMulKernel class
 - Implement opencl kernel for LHS non-transposed and RHS non-transposed
 - Implement opencl kernel for LHS non-transposed and RHS transposed
 - Add test fixture and dataset for matmul
 - Implement transpose_tensor() for reference implementation to transpose high dimensional tensors

Resolves: COMPMID-5944, COMPMID-5951

Co-authored-by: Gunes Bayir <gunes.bayir@arm.com>
Co-authored-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I1d5b8978f41be27baddb3153ade880472141573f
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9333
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/datasets/BatchMatMulDataset.h b/tests/datasets/BatchMatMulDataset.h
new file mode 100644
index 0000000..dad7cc0
--- /dev/null
+++ b/tests/datasets/BatchMatMulDataset.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_DATASETS_BATCHMATMULDATASET
+#define TESTS_DATASETS_BATCHMATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class BatchMatMulDataset
+{
+public:
+    using type = std::tuple<TensorShape, TensorShape, TensorShape>;
+
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator a_it,
+                 std::vector<TensorShape>::const_iterator b_it,
+                 std::vector<TensorShape>::const_iterator dst_it)
+            : _a_it{ std::move(a_it) },
+              _b_it{ std::move(b_it) },
+              _dst_it{ std::move(dst_it) }
+        {
+        }
+
+        std::string description() const
+        {
+            std::stringstream description;
+            description << "A=" << *_a_it << ":";
+            description << "B=" << *_b_it << ":";
+            description << "Out=" << *_dst_it << ":";
+            return description.str();
+        }
+
+        BatchMatMulDataset::type operator*() const
+        {
+            return std::make_tuple(*_a_it, *_b_it, *_dst_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_a_it;
+            ++_b_it;
+            ++_dst_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _a_it;
+        std::vector<TensorShape>::const_iterator _b_it;
+        std::vector<TensorShape>::const_iterator _dst_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_a_shapes.begin(), _b_shapes.begin(), _dst_shapes.begin());
+    }
+
+    int size() const
+    {
+        return std::min(_a_shapes.size(), std::min(_b_shapes.size(), _dst_shapes.size()));
+    }
+
+    void add_config(TensorShape a, TensorShape b, TensorShape dst)
+    {
+        _a_shapes.emplace_back(std::move(a));
+        _b_shapes.emplace_back(std::move(b));
+        _dst_shapes.emplace_back(std::move(dst));
+    }
+
+protected:
+    BatchMatMulDataset()                      = default;
+    BatchMatMulDataset(BatchMatMulDataset &&) = default;
+
+private:
+    std::vector<TensorShape> _a_shapes{};
+    std::vector<TensorShape> _b_shapes{};
+    std::vector<TensorShape> _dst_shapes{};
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* TESTS_DATASETS_BATCHMATMULDATASET */
diff --git a/tests/datasets/LargeBatchMatMulDataset.h b/tests/datasets/LargeBatchMatMulDataset.h
new file mode 100644
index 0000000..0d8ff91
--- /dev/null
+++ b/tests/datasets/LargeBatchMatMulDataset.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_LARGEBATCHMATMULDATASET
+#define ACL_TESTS_DATASETS_LARGEBATCHMATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/BatchMatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class LargeBatchMatMulDataset final : public BatchMatMulDataset
+{
+public:
+    LargeBatchMatMulDataset()
+    {
+        add_config(TensorShape(21U, 13U, 3U, 2U), TensorShape(33U, 21U, 3U, 2U), TensorShape(33U, 13U, 3U, 2U));
+        add_config(TensorShape(38U, 12U, 1U, 5U), TensorShape(21U, 38U, 1U, 5U), TensorShape(21U, 12U, 1U, 5U));
+        add_config(TensorShape(45U, 38U, 3U, 2U), TensorShape(21U, 45U, 3U, 2U), TensorShape(21U, 38U, 3U, 2U));
+    }
+};
+
+class HighDimensionalBatchMatMulDataset final : public BatchMatMulDataset
+{
+public:
+    HighDimensionalBatchMatMulDataset()
+    {
+        add_config(TensorShape(5U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U), TensorShape(5U, 5U, 2U, 2U, 2U, 2U)); // 6D tensor
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_LARGEBATCHMATMULDATASET */
diff --git a/tests/datasets/SmallBatchMatMulDataset.h b/tests/datasets/SmallBatchMatMulDataset.h
new file mode 100644
index 0000000..cfe76be
--- /dev/null
+++ b/tests/datasets/SmallBatchMatMulDataset.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_DATASETS_SMALLBATCHMATMULDATASET
+#define ACL_TESTS_DATASETS_SMALLBATCHMATMULDATASET
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/BatchMatMulDataset.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class SmallBatchMatMulDataset final : public BatchMatMulDataset
+{
+public:
+    SmallBatchMatMulDataset()
+    {
+        add_config(TensorShape(3U, 4U, 2U, 2U), TensorShape(2U, 3U, 2U, 2U), TensorShape(2U, 4U, 2U, 2U));
+        add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U));
+        add_config(TensorShape(31U, 1U), TensorShape(23U, 31U), TensorShape(23U, 1U));
+        add_config(TensorShape(8U, 4U, 2U), TensorShape(16U, 8U, 2U), TensorShape(16U, 4U, 2U));
+        add_config(TensorShape(32U, 2U), TensorShape(17U, 32U), TensorShape(17U, 2U));
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ACL_TESTS_DATASETS_SMALLBATCHMATMULDATASET */