Connect CLMatMul function to quantized kernels and resolve NE BatchMatMul int_8 failures

* Adapt the CLMatMul function and ClMatMul operator to use quantized kernels.
* Add function-level tests.

Resolves: COMPMID-5929 and COMPMID-5811

Change-Id: I5348cdcf07b8074c138e04dfef0a73399377accd
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9575
Reviewed-by: Mohmun02 <MohammedSuhail.Munshi@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
index 712bac0..2af9a4a 100644
--- a/arm_compute/runtime/CL/functions/CLMatMul.h
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
-#define ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
 
 #include "arm_compute/runtime/IFunction.h"
 #include <memory>
+
 namespace arm_compute
 {
 // Forward declarations for used types instead of including their header, that could minimize compile time
@@ -64,10 +65,12 @@
      * - All
      *
      * Valid data type configurations:
-     * |lhs          |rhs          |output         |
-     * |:------------|:------------|:--------------|
-     * |F32          |F32          |F32            |
-     * |F16          |F16          |F16            |
+     * |lhs            |rhs            |dst            |
+     * |:--------------|:--------------|:--------------|
+     * |F32            |F32            |F32            |
+     * |F16            |F16            |F16            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8        |QASYMM8        |
      *
      * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
      *                    and stores the result in the dst tensor of the same batch size.
@@ -76,18 +79,18 @@
      * @note All tensors must have the same data type.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  lhs             LHS input tensor (Matrix or Vector A). Data types supported: F16/F32
-     * @param[in]  rhs             RHS input tensor (Matrix B). Data type supported: same as @p lhs.
-     * @param[out] output          Output tensor. Data type supported: same as @p lhs.
-     * @param[in]  matmul_info     Attributes for MatMul
+     * @param[in]  lhs             Left-hand side tensor containing the input activations as Matrix A. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs             Right-hand side tensor containing the input weights as Matrix B. Data types supported: same as @p lhs.
+     * @param[out] dst             Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+     * @param[in]  matmul_info     Contains MatMul operation information described in @ref MatMulInfo.
      * @param[in]  settings        Class containing flags for function level settings
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+    void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
     /** Initialise the kernel's inputs and output
      *
      * Similar to @ref CLMatMul::configure()
      */
-    void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
+    void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{});
     /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
      *
      * Similar to @ref CLMatMul::configure()
@@ -104,4 +107,4 @@
 };
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */
+#endif /* ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */