COMPMID-2041: Create GEMM helper file for OpenCL.
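
Factor the hand-unrolled load, store, scale and z-offset blocks in gemm.cl
into reusable macros (LOAD_BLOCK, STORE_BLOCK, SCALE_BLOCK, CALCULATE_Z_OFFSET,
REPEAT_VAR_INIT_TO_CONST) defined in the new gemm_helpers.h, so the repeated
#if M0 > n / #if N0 > n ladders no longer have to be written out in every
kernel.

As a rough illustration only (not the exact header text, and assuming that
LOAD_BLOCK(M0, N0, ...) forwards to the LOAD_ROW_<M0> chain defined in
gemm_helpers.h), a call such as

    LOAD_BLOCK(2, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

expands to approximately

    // Illustrative expansion only, assuming LOAD_BLOCK dispatches to LOAD_ROW_2
    VEC_DATA_TYPE(DATA_TYPE, K0)
    a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 + 0 * src_stride_y + zin0));
    VEC_DATA_TYPE(DATA_TYPE, K0)
    a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 + 1 * src_stride_y + zin1));

i.e. the same per-row VLOAD sequences that this patch deletes from the kernels.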

Change-Id: I7203d7e4d5540536b5e6638c81b26a955aa70f5c
Signed-off-by: Usama Arif <usama.arif@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1144
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index da94008..c3107a2 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "helpers.h"
+#include "gemm_helpers.h"
 #include "repeat.h"
 
 #if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
@@ -125,63 +125,10 @@
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply src_stride_z by DEPTH_GEMM3D
 
-    // Note for the REINTERPRET_INPUT_AS_3D case
-    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
-
     input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
 
     // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
-    zin0 *= (cross_plane_pad * src_stride_y);
-#if M0 > 1
-    zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
-    zin1 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
-    zin2 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
-    zin3 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
-    zin4 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
-    zin5 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
-    zin6 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
-    zin7 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 7
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
 
 #else // defined(REINTERPRET_INPUT_AS_3D)
 
@@ -193,79 +140,33 @@
     output_ptr += z * (uint)dst_stride_z;
 
     // ---------------------------Load input values --------------------------------
-
     // Load values from the LHS matrix
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
     BOUNDARY_CONDITION_X(x, a0);
 #if M0 > 1
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
     BOUNDARY_CONDITION_X(x, a1);
 #endif // M0 > 1
 #if M0 > 2
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
     BOUNDARY_CONDITION_X(x, a2);
 #endif // M0 > 2
 #if M0 > 3
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
     BOUNDARY_CONDITION_X(x, a3);
 #endif // M0 > 3
 #if M0 > 4
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
     BOUNDARY_CONDITION_X(x, a4);
 #endif // M0 > 4
 #if M0 > 5
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
     BOUNDARY_CONDITION_X(x, a5);
 #endif // M0 > 5
 #if M0 > 6
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
     BOUNDARY_CONDITION_X(x, a6);
 #endif // M0 > 6
 #if M0 > 7
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
     BOUNDARY_CONDITION_X(x, a7);
 #endif // M0 > 7
-
     // ---------------------------Store output values ------------------------------
-
-    VSTORE(K0)
-    (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#if M0 > 1
-    VSTORE(K0)
-    (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 1
-#if M0 > 2
-    VSTORE(K0)
-    (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 2
-#if M0 > 3
-    VSTORE(K0)
-    (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 3
-#if M0 > 4
-    VSTORE(K0)
-    (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 4
-#if M0 > 5
-    VSTORE(K0)
-    (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 5
-#if M0 > 6
-    VSTORE(K0)
-    (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 6
-#if M0 > 7
-    VSTORE(K0)
-    (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 7
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
 
 #undef BLOCK_SIZE
 #undef OUTPUT_OFFSET_X
@@ -424,63 +325,10 @@
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply src_stride_z by DEPTH_GEMM3D
 
-    // Note for the REINTERPRET_INPUT_AS_3D case
-    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
-
     input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
 
     // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zin0 = (0 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
-    zin0 *= (cross_plane_pad * src_stride_y);
-#if M0 > 1
-    zin1 = (1 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
-    zin1 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zin2 = (2 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
-    zin2 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zin3 = (3 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
-    zin3 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zin4 = (4 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
-    zin4 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zin5 = (5 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
-    zin5 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zin6 = (6 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
-    zin6 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zin7 = (7 + (uint)(y * M0)) / (uint)HEIGHT_GEMM3D;
-    zin7 = min((uint)(DEPTH_GEMM3D - 1), zin7);
-    zin7 *= (cross_plane_pad * src_stride_y);
-#endif // M0 > 7
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
 
 #else // defined(REINTERPRET_INPUT_AS_3D)
 
@@ -494,45 +342,29 @@
     // ---------------------------Load input values --------------------------------
 
     // Load values from the LHS matrix
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
     BOUNDARY_CONDITION_X(x, a0);
 #if M0 > 1
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
     BOUNDARY_CONDITION_X(x, a1);
 #endif // M0 > 1
 #if M0 > 2
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
     BOUNDARY_CONDITION_X(x, a2);
 #endif // M0 > 2
 #if M0 > 3
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
     BOUNDARY_CONDITION_X(x, a3);
 #endif // M0 > 3
 #if M0 > 4
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
     BOUNDARY_CONDITION_X(x, a4);
 #endif // M0 > 4
 #if M0 > 5
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
     BOUNDARY_CONDITION_X(x, a5);
 #endif // M0 > 5
 #if M0 > 6
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
     BOUNDARY_CONDITION_X(x, a6);
 #endif // M0 > 6
 #if M0 > 7
-    VEC_DATA_TYPE(DATA_TYPE, K0)
-    a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
     BOUNDARY_CONDITION_X(x, a7);
 #endif // M0 > 7
-
     // ---------------------------Transpose and store block -----------------------
 
     TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
@@ -711,48 +543,8 @@
 #endif // K0 > 8
 
     // ---------------------------Store output values ------------------------------
-    VSTORE(N0)
-    (a0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#if K0 > 1
-    VSTORE(N0)
-    (a1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // K0 > 1
-#if K0 > 2
-    VSTORE(N0)
-    (a2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // K0 > 2
-#if K0 > 3
-    VSTORE(N0)
-    (a3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // K0 > 3
-#if K0 > 4
-    VSTORE(N0)
-    (a4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (a5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (a6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (a7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 4
-#if K0 > 8
-    VSTORE(N0)
-    (a8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (a9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(N0)
-    (aF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 8
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
 
 #undef BLOCK_SIZE
 #undef OUTPUT_OFFSET_X
@@ -1079,47 +871,8 @@
 #endif // N0 > 2
 
     // ---------------------------Store the output values ------------------------------
-
-    VSTORE(K0)
-    (res0, 0, (__global DATA_TYPE *)(output_ptr + 0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (res1, 0, (__global DATA_TYPE *)(output_ptr + 1 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#if N0 > 2
-    VSTORE(K0)
-    (res2, 0, (__global DATA_TYPE *)(output_ptr + 2 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 2
-#if N0 > 3
-    VSTORE(K0)
-    (res3, 0, (__global DATA_TYPE *)(output_ptr + 3 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 3
-#if N0 > 4
-    VSTORE(K0)
-    (res4, 0, (__global DATA_TYPE *)(output_ptr + 4 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (res5, 0, (__global DATA_TYPE *)(output_ptr + 5 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (res6, 0, (__global DATA_TYPE *)(output_ptr + 6 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (res7, 0, (__global DATA_TYPE *)(output_ptr + 7 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 4
-#if N0 > 8
-    VSTORE(K0)
-    (res8, 0, (__global DATA_TYPE *)(output_ptr + 8 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (res9, 0, (__global DATA_TYPE *)(output_ptr + 9 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resA, 0, (__global DATA_TYPE *)(output_ptr + 10 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resB, 0, (__global DATA_TYPE *)(output_ptr + 11 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resC, 0, (__global DATA_TYPE *)(output_ptr + 12 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resD, 0, (__global DATA_TYPE *)(output_ptr + 13 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resE, 0, (__global DATA_TYPE *)(output_ptr + 14 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-    VSTORE(K0)
-    (resF, 0, (__global DATA_TYPE *)(output_ptr + 15 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 8
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
 
 #undef BLOCK_SIZE
 #undef OUTPUT_OFFSET_X
@@ -1354,63 +1107,12 @@
     rhs_offset += z * rhs_stride_z;
 #endif // defined(MATRIX_B_DEPTH)
 
-    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
 
 #if defined(REINTERPRET_INPUT_AS_3D)
-    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
-    zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
-#if M0 > 1
-    zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
-    zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
-    zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
-    zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
-    zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
-    zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
-    zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
-    zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 7
+    // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -1439,78 +1141,10 @@
         // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
         // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
         // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
 
         // Load values from RHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#if N0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 2
-#if N0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 3
-#if N0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 4
-#if N0 > 8
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 8
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zrhs);
 
         // Accumulate
         ARM_DOT_K0XN0(K0, a0, b, c0);
@@ -1544,54 +1178,10 @@
     for(; i < K; ++i)
     {
         // Load values from LHS matrix
-        DATA_TYPE a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        DATA_TYPE a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        DATA_TYPE a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        DATA_TYPE a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        DATA_TYPE a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
-        DATA_TYPE a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
-        DATA_TYPE a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
-        DATA_TYPE a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
+        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
 
         // Load values from RHS matrix
-        DATA_TYPE b0 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE b1 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#if N0 > 2
-        DATA_TYPE b2 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 2
-#if N0 > 3
-        DATA_TYPE b3 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 3
-#if N0 > 4
-        DATA_TYPE b4 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE b5 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE b6 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE b7 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 4
-#if N0 > 8
-        DATA_TYPE b8 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE b9 = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bA = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bB = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bC = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bD = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bE = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        DATA_TYPE bF = *((__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 8
+        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zrhs);
 
         // Accumulate
         ARM_DOT_K0XN0(1, a0, b, c0);
@@ -1626,60 +1216,9 @@
     REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
-    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
 
     // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
-    zout0 *= (dst_cross_plane_pad * dst_stride_y);
-#if M0 > 1
-    zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
-    zout1 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
-    zout2 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
-    zout3 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
-    zout4 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
-    zout5 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
-    zout6 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
-    zout7 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 7
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1694,61 +1233,11 @@
 
     // Multiply by the weight of matrix-matrix product and store the result
 #if defined(ALPHA)
-    c0 = c0 * (DATA_TYPE)ALPHA;
-#if M0 > 1
-    c1 = c1 * (DATA_TYPE)ALPHA;
-#endif // M0 > 1
-#if M0 > 2
-    c2 = c2 * (DATA_TYPE)ALPHA;
-#endif // M0 > 2
-#if M0 > 3
-    c3 = c3 * (DATA_TYPE)ALPHA;
-#endif // M0 > 3
-#if M0 > 4
-    c4 = c4 * (DATA_TYPE)ALPHA;
-#endif // M0 > 4
-#if M0 > 5
-    c5 = c5 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 6
-    c6 = c6 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 7
-    c7 = c7 * (DATA_TYPE)ALPHA;
-#endif // M0 > 7
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
 #endif // defined(ALPHA)
 
     // Store output block
-    VSTORE(N0)
-    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
-#if M0 > 1
-    VSTORE(N0)
-    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
-#endif // M0 > 1
-#if M0 > 2
-    VSTORE(N0)
-    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
-#endif // M0 > 2
-#if M0 > 3
-    VSTORE(N0)
-    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
-#endif // M0 > 3
-#if M0 > 4
-    VSTORE(N0)
-    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
-#endif // M0 > 4
-#if M0 > 5
-    VSTORE(N0)
-    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
-#endif // M0 > 5
-#if M0 > 6
-    VSTORE(N0)
-    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
-#endif // M0 > 6
-#if M0 > 7
-    VSTORE(N0)
-    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
-#endif // M0 > 7
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -1952,60 +1441,9 @@
     REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;
 
 #if defined(REINTERPRET_INPUT_AS_3D)
-    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
 
     // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zin0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
-    zin0 *= (lhs_cross_plane_pad * lhs_stride_y);
-#if M0 > 1
-    zin1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
-    zin1 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zin2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin2 = min((uint)(DEPTH_GEMM3D - 1), zin2);
-    zin2 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zin3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin3 = min((uint)(DEPTH_GEMM3D - 1), zin3);
-    zin3 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zin4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin4 = min((uint)(DEPTH_GEMM3D - 1), zin4);
-    zin4 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zin5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin5 = min((uint)(DEPTH_GEMM3D - 1), zin5);
-    zin5 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zin6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin6 = min((uint)(DEPTH_GEMM3D - 1), zin6);
-    zin6 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zin7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zin7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
-    zin7 *= (lhs_cross_plane_pad * lhs_stride_y);
-#endif // M0 > 7
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply lhs_stride_z by DEPTH_GEMM3D
@@ -2034,36 +1472,7 @@
         // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
         // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
         // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
-#endif // M0 > 7
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
 
         LD_RHS_VFMA_M0xN0(0, a, c);
         LD_RHS_VFMA_M0xN0(1, a, c);
@@ -2140,60 +1549,8 @@
     REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
-    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
-
     // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zout0 = (0 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
-    zout0 *= (dst_cross_plane_pad * dst_stride_y);
-#if M0 > 1
-    zout1 = (1 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
-    zout1 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zout2 = (2 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
-    zout2 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zout3 = (3 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
-    zout3 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zout4 = (4 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
-    zout4 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zout5 = (5 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
-    zout5 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zout6 = (6 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
-    zout6 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zout7 = (7 + (uint)(y * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
-    zout7 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 7
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -2208,61 +1565,11 @@
 
     // Multiply by the weight of matrix-matrix product and store the result
 #if defined(ALPHA)
-    c0 = c0 * (DATA_TYPE)ALPHA;
-#if M0 > 1
-    c1 = c1 * (DATA_TYPE)ALPHA;
-#endif // M0 > 1
-#if M0 > 2
-    c2 = c2 * (DATA_TYPE)ALPHA;
-#endif // M0 > 2
-#if M0 > 3
-    c3 = c3 * (DATA_TYPE)ALPHA;
-#endif // M0 > 3
-#if M0 > 4
-    c4 = c4 * (DATA_TYPE)ALPHA;
-#endif // M0 > 4
-#if M0 > 5
-    c5 = c5 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 6
-    c6 = c6 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 7
-    c7 = c7 * (DATA_TYPE)ALPHA;
-#endif // M0 > 7
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
 #endif // defined(ALPHA)
 
     // Store output block
-    VSTORE(N0)
-    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
-#if M0 > 1
-    VSTORE(N0)
-    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
-#endif // M0 > 1
-#if M0 > 2
-    VSTORE(N0)
-    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
-#endif // M0 > 2
-#if M0 > 3
-    VSTORE(N0)
-    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
-#endif // M0 > 3
-#if M0 > 4
-    VSTORE(N0)
-    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
-#endif // M0 > 4
-#if M0 > 5
-    VSTORE(N0)
-    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
-#endif // M0 > 5
-#if M0 > 6
-    VSTORE(N0)
-    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
-#endif // M0 > 6
-#if M0 > 7
-    VSTORE(N0)
-    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
-#endif // M0 > 7
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -2498,6 +1805,9 @@
     // Initialize the accumulators
     REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
 
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
     for(int i = 0; i < k; i += K0)
     {
         // Supported cases (M0, K0):
@@ -2510,78 +1820,10 @@
         // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
         // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
         // Load values from LHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 0 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#if M0 > 1
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 1 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 1
-#if M0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 2 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 2
-#if M0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 3 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 3
-#if M0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 4 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 4
-#if M0 > 5
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 5 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 5
-#if M0 > 6
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 6 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 6
-#if M0 > 7
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(lhs_addr + 7 * LHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // M0 > 7
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
 
         // Load values from RHS matrix
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b0 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b1 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#if N0 > 2
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b2 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 2
-#if N0 > 3
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b3 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 3
-#if N0 > 4
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b4 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b5 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b6 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b7 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 4
-#if N0 > 8
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b8 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        b9 = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bA = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bB = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bC = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bD = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bE = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
-        VEC_DATA_TYPE(DATA_TYPE, K0)
-        bF = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_addr + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
-#endif // N0 > 8
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zrhs);
 
         // Accumulate
         ARM_DOT_K0XN0(a0, b, c0);
@@ -2616,61 +1858,9 @@
     REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
-    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
 
     // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
-    zout0 = (0 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout0 = min((uint)(DEPTH_GEMM3D - 1), zout0);
-    zout0 *= (dst_cross_plane_pad * dst_stride_y);
-#if M0 > 1
-    zout1 = (1 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout1 = min((uint)(DEPTH_GEMM3D - 1), zout1);
-    zout1 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 1
-#if M0 > 2
-    zout2 = (2 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout2 = min((uint)(DEPTH_GEMM3D - 1), zout2);
-    zout2 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 2
-#if M0 > 3
-    zout3 = (3 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout3 = min((uint)(DEPTH_GEMM3D - 1), zout3);
-    zout3 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 3
-#if M0 > 4
-    zout4 = (4 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout4 = min((uint)(DEPTH_GEMM3D - 1), zout4);
-    zout4 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 4
-#if M0 > 5
-    zout5 = (5 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout5 = min((uint)(DEPTH_GEMM3D - 1), zout5);
-    zout5 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 5
-#if M0 > 6
-    zout6 = (6 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout6 = min((uint)(DEPTH_GEMM3D - 1), zout6);
-    zout6 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 6
-#if M0 > 7
-    zout7 = (7 + (uint)(get_global_id(1) * (uint)M0)) / (uint)HEIGHT_GEMM3D;
-    zout7 = min((uint)(DEPTH_GEMM3D - 1), zout7);
-    zout7 *= (dst_cross_plane_pad * dst_stride_y);
-#endif // M0 > 7
-
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
     dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
@@ -2684,62 +1874,11 @@
 
     // Multiply by the weight of matrix-matrix product and store the result
 #if defined(ALPHA)
-    c0 = c0 * (DATA_TYPE)ALPHA;
-#if M0 > 1
-    c1 = c1 * (DATA_TYPE)ALPHA;
-#endif // M0 > 1
-#if M0 > 2
-    c2 = c2 * (DATA_TYPE)ALPHA;
-#endif // M0 > 2
-#if M0 > 3
-    c3 = c3 * (DATA_TYPE)ALPHA;
-#endif // M0 > 3
-#if M0 > 4
-    c4 = c4 * (DATA_TYPE)ALPHA;
-#endif // M0 > 4
-#if M0 > 5
-    c5 = c5 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 6
-    c6 = c6 * (DATA_TYPE)ALPHA;
-#endif // M0 > 5
-#if M0 > 7
-    c7 = c7 * (DATA_TYPE)ALPHA;
-#endif // M0 > 7
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
 #endif // defined(ALPHA)
 
     // Store output block
-    VSTORE(N0)
-    (c0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout0));
-#if M0 > 1
-    VSTORE(N0)
-    (c1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout1));
-#endif // M0 > 1
-#if M0 > 2
-    VSTORE(N0)
-    (c2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout2));
-#endif // M0 > 2
-#if M0 > 3
-    VSTORE(N0)
-    (c3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout3));
-#endif // M0 > 3
-#if M0 > 4
-    VSTORE(N0)
-    (c4, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_y + zout4));
-#endif // M0 > 4
-#if M0 > 5
-    VSTORE(N0)
-    (c5, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_y + zout5));
-#endif // M0 > 5
-#if M0 > 6
-    VSTORE(N0)
-    (c6, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_y + zout6));
-#endif // M0 > 6
-#if M0 > 7
-    VSTORE(N0)
-    (c7, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_y + zout7));
-#endif // M0 > 7
-
+    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
 #undef LHS_BLOCK_SIZE
 #undef LHS_OFFSET_X
 #undef LHS_STEP_X
@@ -2892,14 +2031,8 @@
     input_ptr += z * src_stride_z * DEPTH_GEMM3D;
 
     // Load values from Matrix A
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin.s0));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin.s1));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin.s2));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin.s3));
+    LOAD_BLOCK(4, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin.s);
+
 #else  // defined(REINTERPRET_INPUT_AS_3D)
     __global uchar *input_ptr = src.ptr;
 
@@ -4313,21 +3446,8 @@
     {
 #if defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        VEC_DATA_TYPE(DATA_TYPE, 2)
-        a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#else  // defined(REINTERPRET_INPUT_AS_3D)
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
         VEC_DATA_TYPE(DATA_TYPE, 2)
         a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
@@ -4480,21 +3600,7 @@
     dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
 
     // Store output block
-    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + zout.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + zout.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y + zout.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y + zout.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);
 #else // defined(REINTERPRET_OUTPUT_AS_3D)
     // Add offset for batched GEMM
     dst_addr += z * dst_stride_z;
@@ -4671,17 +3777,8 @@
     {
 #if defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A and matrix B
-        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#else  // defined(REINTERPRET_INPUT_AS_3D)
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A and matrix B
         float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -5566,17 +4663,8 @@
     {
 #if defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
-        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#else  // defined(REINTERPRET_INPUT_AS_3D)
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
         half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -5778,19 +4866,8 @@
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
     dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
     // Store the output block
-    vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, hacc, dst_addr, dst_stride_y, zout.s);
 #else // defined(REINTERPRET_OUTPUT_AS_3D)
     // Add offset for batched GEMM
     dst_addr += z * dst_stride_z;
@@ -5945,17 +5022,8 @@
     {
 #if defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
-        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#else  // defined(REINTERPRET_INPUT_AS_3D)
+        LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
         half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -6143,17 +5211,7 @@
     dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
 
     // Store the output block
-    vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
+    STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
 #else // defined(REINTERPRET_OUTPUT_AS_3D)
     // Add offset for batched GEMM
     dst_addr += z * dst_stride_z;
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
new file mode 100644
index 0000000..5bc897b
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
+    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
+    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
+    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+// LOAD_ROW_n loads rows 0..n-1 into the variables BASENAME##0 to BASENAME##(n-1)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** Load a block of M0 rows and N0 columns, applying a per-row Z offset, into the variables BASENAME##0 to BASENAME##(M0-1).
+ * Supported cases: M0=1,2,3,...,16 and N0=1,2,3,4,8,16.
+ * The row variables have consecutive names, e.g. for M0=3 and BASENAME=c the loaded rows are c0, c1 and c2.
+ * The Z offsets are expected to have consecutive names, e.g. for M0=3 and Z=zin the expected offsets are zin0, zin1 and zin2.
+ */
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
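+// Usage sketch (illustration only), mirroring the call sites replaced in gemm.cl: assuming
+// M0 = NUM_ELEMS_PROCESSED_PER_THREAD_Y = 2 and N0 = 4,
+//
+//   LOAD_BLOCK(2, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+//
+// expands to
+//
+//   half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+//   half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));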
+
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                     \
+    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                     \
+    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                     \
+    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                     \
+    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                     \
+    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                     \
+    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                     \
+    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
+    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                     \
+    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+// CALCULATE_Z_OFFSET_n calculates the Z offsets Z##0 to Z##(n-1)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+/** Calculate the Z offsets Z##0 to Z##(M0-1). The resulting offsets have consecutive names, e.g. for M0=3 and Z=zin the offsets are zin0, zin1 and zin2.
+ * Note for the REINTERPRET_INPUT_AS_3D case:
+ * since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+ * in order to take into account the presence of possible cross-plane padding, e.g.:
+ *
+ *  |                  |
+ *  |      plane0      |
+ *  |                  |
+ *  |__________________|
+ *  |******************|
+ *  |  cross_plane_pad |
+ *  |******************|
+ *  |                  |
+ *  |      plane1      |
+ *  |                  |
+ *  |__________________|
+ */
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
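+// Usage sketch (illustration only), mirroring the call site in gemm.cl: assuming M0 = 2,
+//
+//   CALCULATE_Z_OFFSET(2, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+//
+// expands to
+//
+//   zin0 = (0 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+//   zin0 = min((uint)(DEPTH_GEMM3D - 1), zin0);
+//   zin0 *= (cross_plane_pad * src_stride_y);
+//   zin1 = (1 + (uint)(y * (uint)2)) / (uint)HEIGHT_GEMM3D;
+//   zin1 = min((uint)(DEPTH_GEMM3D - 1), zin1);
+//   zin1 *= (cross_plane_pad * src_stride_y);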
+
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE(N0)                                                 \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                 \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
+    VSTORE(N0)                                                  \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE(N0)                                                  \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+// STORE_ROW_n stores the rows 0..n-1 from variables BASENAME##0 to BASENAME##(n-1)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** Store a block of M0 rows and N0 columns, applying a per-row Z offset.
+ * Supported cases: M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to be held in variables with consecutive names, e.g. for M0=3 and BASENAME=c the expected variables are c0, c1 and c2.
+ * The Z offsets are expected to have consecutive names, e.g. for M0=3 and Z=zin the expected offsets are zin0, zin1 and zin2.
+ */
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
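+// Usage sketch (illustration only), mirroring the call sites replaced in gemm.cl: assuming
+// M0 = NUM_ELEMS_PROCESSED_PER_THREAD_Y = 2 and N0 = 8,
+//
+//   STORE_BLOCK(2, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+//
+// expands to
+//
+//   vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+//   vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));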
+
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+    BASENAME##0 = BASENAME##0 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##1 = BASENAME##1 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##2 = BASENAME##2 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##3 = BASENAME##3 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##4 = BASENAME##4 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##5 = BASENAME##5 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##6 = BASENAME##6 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##7 = BASENAME##7 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##8 = BASENAME##8 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
+    BASENAME##9 = BASENAME##9 * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##A = BASENAME##A * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##B = BASENAME##B * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##C = BASENAME##C * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##D = BASENAME##D * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##E = BASENAME##E * (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
+    BASENAME##F = BASENAME##F * (DATA_TYPE)SCALE;
+
+// SCALE_ROW_n scales the variables BASENAME##0 to BASENAME##(n-1) by SCALE
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+/** Scale the values stored in the variables BASENAME##0 to BASENAME##(N-1) by SCALE.
+ * Supported cases: N=1,2,3,...,16.
+ */
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
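+// Usage sketch (illustration only; ALPHA stands for any scalar compile-time constant): assuming N = 2,
+//
+//   SCALE_BLOCK(2, half, acc, ALPHA);
+//
+// expands to
+//
+//   acc0 = acc0 * (half)ALPHA;
+//   acc1 = acc1 * (half)ALPHA;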
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 792544d..e39dbc3 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -55,6 +55,17 @@
 
 #define float1 float
 #define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
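+
+// The scalar aliases above and vload1 let the size-generic helpers (e.g. VEC_DATA_TYPE and the
+// LOAD_ROW_n macros in gemm_helpers.h) also handle the single-element case: VEC_DATA_TYPE(half, 1)
+// resolves to half1 (i.e. half) and, assuming VLOAD(1) pastes to vload1 as for the other sizes,
+// a one-element "vector" load becomes a plain dereference, *(OFFSET + PTR).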
 
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)