IVGCVSW-7308 Add GpuAcc Batch MatMul workload

* Call dedicated MatMul kernel in ACL
* Add int8 tests
* Add int8 to documentation
* Force tensors to be dynamic (non-const), as requested by ACL

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I7b7ac20deec8637dc46ca990d339d92c4587cbe4
diff --git a/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp
index 5277efc..d45fb7e 100644
--- a/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp
+++ b/src/backends/cl/workloads/ClBatchMatMulWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -7,35 +7,25 @@
 
 #include "ClBaseWorkload.hpp"
 
-#include <arm_compute/runtime/IFunction.h>
-#include <arm_compute/runtime/CL/CLTensor.h>
-#include <memory>
+#include <arm_compute/runtime/CL/functions/CLMatMul.h>
 
 namespace armnn
 {
-    arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX,
-                                              const TensorInfo& inputY,
-                                              const TensorInfo& output,
-                                              const BatchMatMulDescriptor& descriptor);
+arm_compute::Status ClBatchMatMulValidate(const TensorInfo& inputX,
+                                          const TensorInfo& inputY,
+                                          const TensorInfo& output,
+                                          const BatchMatMulDescriptor& descriptor,
+                                          const ActivationDescriptor* activationDescriptor);
 
-    class ClBatchMatMulWorkload : public ClBaseWorkload<BatchMatMulQueueDescriptor>
-    {
-    public:
-        ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
-                              const WorkloadInfo& info,
-                              const arm_compute::CLCompileContext& clCompileContext);
-        virtual void Execute() const override;
+class ClBatchMatMulWorkload : public ClBaseWorkload<BatchMatMulQueueDescriptor>
+{
+public:
+    ClBatchMatMulWorkload(const BatchMatMulQueueDescriptor& descriptor,
+                          const WorkloadInfo& info,
+                          const arm_compute::CLCompileContext& clCompileContext);
+    virtual void Execute() const override;
 
-    private:
-        // ACL layers required to fully form a Batch Mat Mul layer.
-        std::unique_ptr<arm_compute::IFunction> m_GEMMLayer;
-        std::unique_ptr<arm_compute::IFunction> m_PermuteLayerX;
-        std::unique_ptr<arm_compute::IFunction> m_PermuteLayerY;
-
-        // Additional CL arm_compute::Tensors.
-        // Required to perform permutations.
-        arm_compute::CLTensor m_PermutedTensorX;
-        arm_compute::CLTensor m_PermutedTensorY;
-
-    };
+private:
+    mutable arm_compute::CLMatMul m_MatMulLayer;
+};
 } //namespace armnn