APPBROWSER-312: Fully connected performance optimization

Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830
Reviewed-by: Stephen Li <stephen.li@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
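
A note on the FP16 packing used throughout this patch: on the GLSL side an SSBO element of type uint carries two half floats, uvec2 carries four, and uvec4 carries eight, so a single uvec4 load or store moves 16 bytes. The shaders convert with packHalf2x16/unpackHalf2x16. Below is a rough CPU-side sketch of those semantics, an illustration only (truncating conversion, normal finite values, no NaN/Inf/denormal handling):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified float -> IEEE 754 binary16 (normal range only, truncation).
static uint16_t float_to_half(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    uint32_t sign = (bits >> 16) & 0x8000u;
    uint32_t exp  = ((bits >> 23) & 0xFFu) - 127u + 15u;
    uint32_t mant = (bits >> 13) & 0x3FFu;
    return uint16_t(sign | (exp << 10) | mant);
}

// GLSL packHalf2x16 equivalent: 'x' lands in the 16 LSBs.
// unpackHalf2x16 is the inverse mapping.
static uint32_t pack_half2x16(float x, float y)
{
    return uint32_t(float_to_half(x)) | (uint32_t(float_to_half(y)) << 16);
}

int main()
{
    printf("0x%08X\n", pack_half2x16(1.0f, 0.5f)); // 0x38003C00
    return 0;
}
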
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
index 20f28cb..77a52b2 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
@@ -57,6 +57,7 @@
 private:
     IGCTensor       *_accum;
     const IGCTensor *_biases;
+    gles::NDRange    _lws;
 };
 }
 
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 1a0c9f1..87a109a 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -25,14 +25,6 @@
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 #include "helpers.h"
 
-#ifdef DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, restrict);
-#else  // DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, restrict);
-#endif // DATA_TYPE_FP16
-
 layout(std140) uniform shader_params
 {
 #ifdef IM2COL_GENERIC
@@ -58,10 +50,21 @@
 };
 
 #ifdef DATA_TYPE_FP16
+#if defined(IM2COL_REDUCED_8X)
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, restrict);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, restrict);
+#else                            /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#endif                           /* IM2COL_REDUCED_8X */
 
 precision mediump float;
 
 #ifdef IM2COL_REDUCED
+#if defined(IM2COL_REDUCED_GENERIC)
 /** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
@@ -142,9 +145,55 @@
     }
 #endif // HAS_BIAS
 }
-#endif // IM2COL_REDUCED
+#else /* IM2COL_REDUCED_GENERIC */
+/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+#if defined(IM2COL_REDUCED_8X)
+    uint     tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uvec4    tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(4));
+    STORE1(dst, tmp_out_offset >> uint(4), tmp);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+    uint  tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uvec2 tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(3));
+    STORE1(dst, tmp_out_offset >> uint(3), tmp);
+#else                            /* IM2COL_REDUCED_8X */
+    uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uint tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(2));
+    STORE1(dst, tmp_out_offset >> uint(2), tmp);
+#endif                           /* IM2COL_REDUCED_8X */
+}
+#endif                           /* IM2COL_REDUCED_GENERIC */
+#endif                           // IM2COL_REDUCED
 
 #elif defined(DATA_TYPE_FP32)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
 
 #ifdef IM2COL_GENERIC
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
old mode 100755
new mode 100644
index ffa0ebb..3ed27d5
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -475,6 +475,7 @@
 #elif defined(DATA_TYPE_FP16)
 precision mediump float;
 #ifdef GEMM_MM_FLOATING_POINT
+#if defined(MM_PROCESS_4X)
 BUFFER_DECLARATION(src0, 1, uint, readonly);
 BUFFER_DECLARATION(src1, 2, uvec2, readonly);
 BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
@@ -526,14 +527,41 @@
 
     /* Reset accumulators */
     vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; src0.current_offset < (end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
     {
-        uint packed_a0;
+        uint packed_a;
         vec2 a0;
 
-        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
-        a0 = vec2(unpackHalf2x16(packed_a0));
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0 = vec2(unpackHalf2x16(packed_a));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
         uvec2 packed_b0;
         uvec2 packed_b1;
@@ -548,6 +576,18 @@
 
         acc0 += b0 * vec4(a0.x);
         acc0 += b1 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.x);
+        acc1 += b1 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.x);
+        acc2 += b1 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.x);
+        acc3 += b1 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     }
 
     for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)
@@ -557,6 +597,24 @@
 
         GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
         a0 = vec2(unpackHalf2x16(packed_a0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 1);
+        a1 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 2);
+        a2 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 3);
+        a3 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
         uvec2 packed_b0;
         vec4  b0;
@@ -566,6 +624,15 @@
         b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
 
         acc0 += b0 * (a0.x);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * (a1.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * (a2.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * (a3.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     }
 
     /* Multiply by the weight of vector-matrix product */
@@ -574,10 +641,340 @@
     uvec2 packed_d;
     packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
     GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+#endif                                 // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
-#endif /* GEMM_MM_FLOATING_POINT */
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* PROCESS_4X */
+BUFFER_DECLARATION(src0, 1, uvec4, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns and the alpha's value need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a0[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec4 a1[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec4 a2[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 a3[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b;
+        vec4  b;
+
+        for(int i = 0; i < 8; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+
+            acc0 += b * vec4(a0[j][k]);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+            acc1 += b * vec4(a1[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+            acc2 += b * vec4(a2[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+            acc3 += b * vec4(a3[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        }
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a0[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec4 a1[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec4 a2[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 a3[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b;
+        vec4  b;
+
+        int leftover = COLS_A % 8;
+
+        for(int i = 0; i < leftover; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+
+            acc0 += b * vec4(a0[j][k]);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+            acc1 += b * vec4(a1[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+            acc2 += b * vec4(a2[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+            acc3 += b * vec4(a3[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        }
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc0 = acc0 * vec4(ALPHA);
+
+    uvec2 packed_d;
+    packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+#endif                       // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#elif defined(MM_PROCESS_8X) /* PROCESS_4X */
+BUFFER_DECLARATION(src0, 1, uvec4, readonly);
+BUFFER_DECLARATION(src1, 2, uvec4, readonly);
+BUFFER_DECLARATION(dst, 3, uvec4, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns and the alpha's value need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc[2];
+
+    acc[0] = vec4(0.0f);
+    acc[1] = vec4(0.0f);
+
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+        uvec4 packed_b;
+        vec4  b[2];
+
+        for(int i = 0; i < 8; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+            b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+
+            acc[0] += b[0] * vec4(a[j][k]);
+            acc[1] += b[1] * vec4(a[j][k]);
+        }
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+        uvec4 packed_b;
+        vec4  b[2];
+
+        int leftover = COLS_A % 8;
+
+        for(int i = 0; i < leftover; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+            b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+
+            acc[0] += b[0] * vec4(a[j][k]);
+            acc[1] += b[1] * vec4(a[j][k]);
+        }
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc[0] = acc[0] * vec4(ALPHA);
+    acc[1] = acc[1] * vec4(ALPHA);
+
+    uvec4 packed_d;
+    packed_d = uvec4(packHalf2x16(acc[0].xy), packHalf2x16(acc[0].zw), packHalf2x16(acc[1].xy), packHalf2x16(acc[1].zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+}
+#endif                       /* PROCESS_4X */
+#endif                       /* GEMM_MM_FLOATING_POINT */
 
 #ifdef GEMM_ACCUMULATE_BIASES
+#if defined(ACCUM_PROCESS_4X)
 BUFFER_DECLARATION(accum, 1, uvec2, restrict);
 BUFFER_DECLARATION(biases, 2, uvec2, readonly);
 
@@ -617,7 +1014,54 @@
     packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw));
     GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
 }
-#endif /* GEMM_ACCUMULATE_BIASES */
-#else  /* DATA_TYPE_FP32 */
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+BUFFER_DECLARATION(accum, 1, uvec4, restrict);
+BUFFER_DECLARATION(biases, 2, uvec4, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+
+    vec4  u[2];
+    vec4  v[2];
+    uvec4 packed_s[2];
+    GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
+    GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
+
+    u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
+
+    v[0] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    v[1] = vec4(unpackHalf2x16(packed_s[1].z), unpackHalf2x16(packed_s[1].w));
+
+    vec4 r[2];
+    r[0]        = u[0] + v[0];
+    r[1]        = u[1] + v[1];
+    packed_s[0] = uvec4(packHalf2x16(r[0].xy), packHalf2x16(r[0].zw), packHalf2x16(r[1].xy), packHalf2x16(r[1].zw));
+    GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+}
+#endif                          /* ACCUM_PROCESS_4X */
+#endif                          /* GEMM_ACCUMULATE_BIASES */
+#else                           /* DATA_TYPE_FP32 */
 #error Data type not supported
 #endif /* DATA_TYPE_FP32 */
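
The NUM_ELEMS_PROCESSED_PER_THREAD_Y ladder in the new MM_PROCESS_4X path is classic register blocking: one invocation accumulates up to four output rows, so every loaded row of B is reused four times instead of once. A minimal scalar C++ sketch of the same idea (illustrative only; the real shader additionally works on packed halves and scales by ALPHA):

#include <cstdio>

// One "thread" computes a 4x4 tile of C; each B row load is reused 4 times.
void mm_tile_4x4(const float *A, const float *B, float *C,
                 int K, int lda, int ldb, int ldc)
{
    float acc[4][4] = {};
    for(int k = 0; k < K; ++k)
    {
        float b[4];
        for(int x = 0; x < 4; ++x) b[x] = B[k * ldb + x]; // one row of B
        for(int y = 0; y < 4; ++y)                        // reused 4x
            for(int x = 0; x < 4; ++x)
                acc[y][x] += A[y * lda + k] * b[x];
    }
    for(int y = 0; y < 4; ++y)
        for(int x = 0; x < 4; ++x)
            C[y * ldc + x] = acc[y][x];
}

int main()
{
    const int K = 8;
    float A[4 * 8], B[8 * 4], C[4 * 4];
    for(int i = 0; i < 32; ++i) { A[i] = 1.0f; B[i] = 2.0f; }
    mm_tile_4x4(A, B, C, K, 8, 4, 4);
    printf("C[0][0] = %.1f\n", C[0]); // 8 * 1.0 * 2.0 = 16.0
    return 0;
}
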
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
index 6d020fe..c251d95 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -109,15 +109,16 @@
 #elif defined(DATA_TYPE_FP16)
 precision mediump float;
 
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-
 layout(std140) uniform shader_params
 {
     IMAGE_PARAM_DECLARATION(src);
     IMAGE_PARAM_DECLARATION(dst);
 };
 
+#if defined(TRANSPOSE_4X4)
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
 /** This OpenGL ES kernel computes the matrix transposition of input matrix
  *
  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
@@ -184,4 +185,93 @@
     GC_STORE1(packed_s[2], dst, uint((dst_offset_in_bytes + uint(2) * dst_stride_y) >> 3));
     GC_STORE1(packed_s[3], dst, uint((dst_offset_in_bytes + uint(3) * dst_stride_y) >> 3));
 }
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+
+#define SWAP_ROW(u0, l0)     \
+    {                        \
+        tmp_swap = u0;       \
+        u0       = l0;       \
+        l0       = tmp_swap; \
+    }
+
+#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
+    {                                            \
+        vec4 tmp_swap;                           \
+        SWAP_ROW(u0, l0);                        \
+        SWAP_ROW(u1, l1);                        \
+        SWAP_ROW(u2, l2);                        \
+        SWAP_ROW(u3, l3);                        \
+    }
+
+#define TRANSPOSE_4x4(u0, u1, u2, u3) \
+    {                                 \
+        vec4 tmp;                     \
+        tmp.xyz = u0.yzw;             \
+        u0.y    = u1.x;               \
+        u0.z    = u2.x;               \
+        u0.w    = u3.x;               \
+        u1.x    = tmp.x;              \
+        u2.x    = tmp.y;              \
+        u3.x    = tmp.z;              \
+        tmp.xy  = u1.zw;              \
+        u1.z    = u2.y;               \
+        u1.w    = u3.y;               \
+        u2.y    = tmp.x;              \
+        u3.y    = tmp.y;              \
+        tmp.x   = u2.w;               \
+        u2.w    = u3.z;               \
+        u3.z    = tmp.x;              \
+    }
+
+/** This OpenGL ES kernel computes the matrix transposition of input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    // Compute source address
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    vec4 u[8][2];
+
+    uvec4 packed_s[8];
+
+    for(int i = 0; i < 8; i++)
+    {
+        GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
+        u[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
+        u[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+    }
+
+    // Transpose the block
+    TRANSPOSE_4x4(u[0][0], u[1][0], u[2][0], u[3][0]);
+    TRANSPOSE_4x4(u[0][1], u[1][1], u[2][1], u[3][1]);
+    TRANSPOSE_4x4(u[4][0], u[5][0], u[6][0], u[7][0]);
+    TRANSPOSE_4x4(u[4][1], u[5][1], u[6][1], u[7][1]);
+    SWAP_4x4(u[0][1], u[1][1], u[2][1], u[3][1], u[4][0], u[5][0], u[6][0], u[7][0]);
+
+    // Store the block at (y, x)
+    uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+
+    for(int i = 0; i < 8; i++)
+    {
+        packed_s[i] = uvec4(packHalf2x16(u[i][0].xy), packHalf2x16(u[i][0].zw), packHalf2x16(u[i][1].xy), packHalf2x16(u[i][1].zw));
+        GC_STORE1(packed_s[i], dst, uint((dst_offset_in_bytes + uint(i) * dst_stride_y) >> 4));
+    }
+}
+#endif /* TRANSPOSE_4X4 */
 #endif /*ARM_COMPUTE_ENABLE_FP16*/
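
The TRANSPOSE_8X8 path builds the 8x8 transpose from 4x4 pieces: transpose each quadrant in registers, then swap the two off-diagonal quadrants, since [[A,B],[C,D]]^T = [[A^T,C^T],[B^T,D^T]]. A small self-contained check of that identity (plain C++, illustrative):

#include <cstdio>

// In-place transpose of the 4x4 block whose top-left corner is (r, c).
void transpose4(float m[8][8], int r, int c)
{
    for(int i = 0; i < 4; ++i)
        for(int j = i + 1; j < 4; ++j)
        {
            float t = m[r + i][c + j];
            m[r + i][c + j] = m[r + j][c + i];
            m[r + j][c + i] = t;
        }
}

int main()
{
    float m[8][8];
    for(int i = 0; i < 8; ++i)
        for(int j = 0; j < 8; ++j)
            m[i][j] = float(i * 8 + j);

    transpose4(m, 0, 0); transpose4(m, 0, 4); // A^T, B^T
    transpose4(m, 4, 0); transpose4(m, 4, 4); // C^T, D^T
    for(int i = 0; i < 4; ++i)                // swap B^T and C^T quadrants
        for(int j = 0; j < 4; ++j)
        {
            float t = m[i][4 + j];
            m[i][4 + j] = m[4 + i][j];
            m[4 + i][j] = t;
        }

    printf("%s\n", m[3][5] == float(5 * 8 + 3) ? "transposed" : "bug");
    return 0;
}
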
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index b032bc5..a7d721d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -53,7 +53,6 @@
 void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
@@ -68,6 +67,24 @@
         ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
     }
 
+    // Get convolved dimensions
+    unsigned int owidth  = 0;
+    unsigned int oheight = 0;
+    std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, owidth);
+    output_shape.set(1, oheight);
+    output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
     _conv_pad_x    = std::get<0>(conv_info.pad());
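
The auto-initialisation added above derives the output shape before the type and shape validation runs, which is what lets callers pass an uninitialised output tensor. For reference, the usual convolution output-size arithmetic, assumed here to match what scaled_dimensions computes for this PadStrideInfo (hypothetical numbers):

#include <cstdio>

int main()
{
    // out = (in + 2*pad - kernel) / stride + 1   (integer division)
    const int in_w = 224, in_h = 224, kernel = 3, stride = 2, pad = 1;
    const int out_w = (in_w + 2 * pad - kernel) / stride + 1; // 112
    const int out_h = (in_h + 2 * pad - kernel) / stride + 1; // 112
    printf("output plane: %dx%d\n", out_w, out_h);
    return 0;
}
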
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index 8625d37..944585d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -37,7 +37,7 @@
 using namespace arm_compute;
 
 GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
+    : _accum(nullptr), _biases(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
 {
 }
 
@@ -51,14 +51,23 @@
     _accum  = accum;
 
     std::set<std::string> build_opts;
-    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
 
     // Create kernel
     build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
+
+#define ACCUM_PROCESS_4X
+
+#if defined(ACCUM_PROCESS_4X)
+    build_opts.emplace("#define ACCUM_PROCESS_4X");
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+    build_opts.emplace("#define ACCUM_PROCESS_8X");
+#endif                          /* ACCUM_PROCESS_4X */
     std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     build_opts.emplace(("#define " + dt_name));
+
     _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
 
     // Configure kernel window
@@ -70,13 +79,21 @@
     }
     else if(_accum->info()->data_type() == DataType::F16)
     {
+#if defined(ACCUM_PROCESS_4X)
         num_elems_processed_per_iteration = 4;
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+        num_elems_processed_per_iteration = 8;
+#endif                          /* ACCUM_PROCESS_4X */
     }
 
-    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+    const int  accum_width         = accum->info()->dimension(0);
+    const int  accum_padding_right = ceil_to_multiple(accum_width, num_elems_processed_per_iteration * _lws[0]) - accum_width;
+    BorderSize border              = BorderSize(0, accum_padding_right, 0, 0);
 
-    AccessWindowStatic     biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
-    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+    Window win = calculate_max_enlarged_window(*_accum->info(), Steps(num_elems_processed_per_iteration), border);
+
+    AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration * _lws[0]), biases->info()->dimension(1));
+    AccessWindowStatic accum_access(_accum->info(), 0, 0, accum_width + accum_padding_right, _accum->info()->dimension(1));
 
     update_window_and_padding(win, biases_access, accum_access);
 
@@ -107,13 +124,22 @@
         }
         else if(_accum->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _accum, BufferParam(1, 3), accum_slice);
-            add_1D_tensor_argument(idx, _biases, BufferParam(2, 3), biases_slice);
+#if defined(ACCUM_PROCESS_4X)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+            BufferParam param             = { 1, 4 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#endif                          /* ACCUM_PROCESS_4X */
         }
 
         _kernel.update_shader_params();
 
-        enqueue(*this, accum_slice);
+        enqueue(*this, accum_slice, _lws);
     }
     while(window.slide_window_slice_2D(accum_slice));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a75ab6b..8179525 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -118,9 +118,23 @@
         switch(input0->info()->data_type())
         {
             case DataType::F16:
-                num_elems_processed_per_iteration_x = 4;
-                num_elems_processed_per_iteration_y = 1;
                 build_opts.emplace("#define DATA_TYPE_FP16");
+
+#define MM_PROCESS_4X_OPTIMIZED
+
+#if defined(MM_PROCESS_4X)
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X");
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                num_elems_processed_per_iteration_x = 8;
+                num_elems_processed_per_iteration_y = 1;
+                build_opts.emplace("#define MM_PROCESS_8X");
+#endif                                 /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
@@ -143,8 +157,12 @@
 
         win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
+#if defined(MM_PROCESS_4X_OPTIMIZED)
+        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#else  /* MM_PROCESS_4X_OPTIMIZED */
         AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
                                          num_elems_processed_per_iteration_y));
+#endif /* MM_PROCESS_4X_OPTIMIZED */
         AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
         AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
@@ -185,9 +203,19 @@
         switch(_input0->info()->data_type())
         {
             case DataType::F16:
+#if defined(MM_PROCESS_4X)
                 add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
                 add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
                 add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
+#endif                                 /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 97c4dc4..e849891 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -107,7 +107,38 @@
     else
     {
         build_opts.insert("#define IM2COL_REDUCED");
-        _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+        }
+        else if(input->info()->data_type() == DataType::F16)
+        {
+            int input_width  = input->info()->dimension(0);
+            int input_height = input->info()->dimension(1);
+
+            build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
+            if(input_width % 8 == 0)
+            {
+                _num_elems_processed_per_iteration = 8;
+                build_opts.insert("#define IM2COL_REDUCED_8X");
+            }
+            else if(input_width % 4 == 0)
+            {
+                _num_elems_processed_per_iteration = 4;
+                build_opts.insert("#define IM2COL_REDUCED_4X");
+            }
+            else if(input_width % 2 == 0)
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_2X");
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_GENERIC");
+            }
+        }
 
         // Create kernel
         _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
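
The selection logic above always prefers the widest vector path whose element count divides the row width, falling back to the generic 2-element path otherwise. A standalone sketch of the same dispatch (illustrative):

#include <cstdio>

const char *select_variant(int width, int *elems)
{
    if(width % 8 == 0) { *elems = 8; return "IM2COL_REDUCED_8X"; }
    if(width % 4 == 0) { *elems = 4; return "IM2COL_REDUCED_4X"; }
    if(width % 2 == 0) { *elems = 2; return "IM2COL_REDUCED_2X"; }
    *elems = 2;         return "IM2COL_REDUCED_GENERIC";
}

int main()
{
    for(int w : { 64, 28, 14, 7 })
    {
        int n;
        const char *v = select_variant(w, &n);
        printf("width %2d -> %s (%d elems/iter)\n", w, v, n);
    }
    return 0;
}
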
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 5bd34c2..acb9988 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -64,12 +64,25 @@
     build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
 
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 4;
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+#define TRANSPOSE_8X8
+
+#if defined(TRANSPOSE_4X4)
+        build_opts.emplace(("#define TRANSPOSE_4X4"));
+        num_elems_processed_per_iteration = 4;
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+        build_opts.emplace(("#define TRANSPOSE_8X8"));
+        num_elems_processed_per_iteration = 8;
+#endif                       /* TRANSPOSE_4X4 */
+    }
+
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
 
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 4;
-
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
 
     AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
@@ -100,8 +113,17 @@
         }
         else if(_input->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
-            add_2D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+#if defined(TRANSPOSE_4X4)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#endif                       /* TRANSPOSE_4X4 */
         }
 
         _kernel.update_shader_params();
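
Switching the FP16 transpose from 4x4 to 8x8 tiles quarters the number of dispatched invocations for the same matrix, at the cost of wider per-invocation state (eight uvec4 loads). A quick illustration of the work-item count (hypothetical 64x64 matrix):

#include <cstdio>

int main()
{
    const int w = 64, h = 64;
    printf("4x4 tiles: %d invocations\n", (w / 4) * (h / 4)); // 256
    printf("8x8 tiles: %d invocations\n", (w / 8) * (h / 8)); // 64
    return 0;
}
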
diff --git a/tests/benchmark/fixtures/FullyConnectedLayerFixture.h b/tests/benchmark/fixtures/FullyConnectedLayerFixture.h
index 2d1f233..ef08c4a 100644
--- a/tests/benchmark/fixtures/FullyConnectedLayerFixture.h
+++ b/tests/benchmark/fixtures/FullyConnectedLayerFixture.h
@@ -30,6 +30,13 @@
 #include "tests/Utils.h"
 #include "tests/framework/Fixture.h"
 
+#ifdef ARM_COMPUTE_GC
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "tests/GLES_COMPUTE/Helper.h"
+
+using namespace arm_compute::test::gles_compute;
+#endif /* ARM_COMPUTE_GC */
+
 namespace arm_compute
 {
 namespace test
@@ -71,6 +78,12 @@
     void run()
     {
         fc_layer.run();
+#ifdef ARM_COMPUTE_GC
+        if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
+        {
+            force_sync_tensor(dst);
+        }
+#endif /* ARM_COMPUTE_GC */
     }
 
     void teardown()
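
The force_sync_tensor call added to the benchmark fixture matters because GLES compute dispatches are asynchronous: without forcing completion of the GPU work, the measured region could end before the fully connected layer has actually executed. Conceptually it amounts to something like the following raw-GL sketch (an analogy only; the real helper is a test utility with its own mechanism):

#include <GLES3/gl31.h>

// Make shader storage writes visible, then block until the GPU drains.
void wait_for_gpu()
{
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    glFinish();
}
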