Add cl_khr_integer_dot_product extension support

* Replace arm_dot / arm_dot_acc with the built-in dot() when the
  cl_khr_integer_dot_product extension is available.

Resolves: COMPMID-5206
Change-Id: I7fd763e2421987584e4dae271008972644ea2f41
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7647
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index b9b2b56..2fed765 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -247,7 +247,7 @@
         concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
     }
 
-    if(_device.supported("cl_arm_integer_dot_product_int8"))
+    if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
     {
         concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 ";
     }
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index f9d18ec..9fad66d 100644
--- a/src/core/CL/cl_kernels/common/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1096,17 +1096,17 @@
         VEC_DATA_TYPE(DATA_TYPE, 16)
         a0 = vload16(0, matrix_a + i);
 
-        sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
 
         a0 = vload16(1, matrix_a + i);
 
-        sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
-        sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
     }
 
     // This for loop performs the leftover accumulations
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index ec57022..69898e1 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -333,7 +333,11 @@
     ({                                                \
         c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
     })
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
+#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
+#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
+#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) //  defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
 #define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
 #define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
 #define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));