Use MatMul in fully connected layer with dynamic weights when supported

 - Use MatMul kernels in FC layer when using dynamic weights without broadcasting or bias.
 - Fix minor typo in IClMatMulNativeKernelConfig.h

Partially Resolves : [COMPMID-6193]

Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Id494062b5b4f4e75ff9714c202dde941955afa52
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9797
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
index bd415bb..8cf857d 100644
--- a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -21,9 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "activation_float_helpers.h"
 #include "helpers.h"
 #include "tile_helpers.h"
-#include "activation_float_helpers.h"
 
 #if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
 /** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
@@ -189,7 +189,7 @@
     {
         LOOP_UNROLLING(int, j, 0, 1, N0,
         {
-            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
         })
     })
 
@@ -368,7 +368,7 @@
     {
         LOOP_UNROLLING(int, j, 0, 1, N0,
         {
-            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
         })
     })
 
@@ -549,7 +549,7 @@
     {
         LOOP_UNROLLING(int, j, 0, 1, N0,
         {
-            acc[i].s[j] += ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
         })
     })
 
@@ -734,7 +734,7 @@
     {
         LOOP_UNROLLING(int, j, 0, 1, N0,
         {
-            acc[i].s[j] += ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
         })
     })