APPBROWSER-390,397,398: bugfix and fully connected validation issue on specific dataset

Change-Id: I227e90445715c3bd394e49930b010c0a5f5ca177
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118108
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Joel Liang <joel.liang@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index c81bed7..580acc1 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -509,7 +509,7 @@
     vec4 acc3 = vec4(0.0f);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(2));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(4));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
     {
         vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -549,7 +549,7 @@
         vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        vec  a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+        vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
@@ -615,7 +615,7 @@
     vec4 acc3 = vec4(0.0f);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
     {
         vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -729,7 +729,7 @@
     acc[0] = vec4(0.0f);
     acc[1] = vec4(0.0f);
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
     {
         vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -823,7 +823,7 @@
     VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
 
     vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
-    vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, bias_iter);
+    vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
 
     vec4 r[2];
     r[0] = u[0] + v[0];