Add Bias to MatMul Kernels and add support for use in Fully Connected Layer

Resolves: [COMPMID-6316]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I08e6bac9e6b46b76978da0dc6a48ccfe3dde5086
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9833
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
index 9656a59..c7ef8ae 100644
--- a/src/core/CL/cl_kernels/common/mat_mul.cl
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -25,6 +25,21 @@
 #include "helpers.h"
 #include "tile_helpers.h"
 
+#ifdef BIAS
+// This function performs in-place bias addition for float/half datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+    TILE(DATA_TYPE, 1, N0, bias_tile);
+
+    // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
 #if defined(MAT_MUL_NATIVE_NT_NT)
 /** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
  *
@@ -43,32 +58,42 @@
  *  - K0 = 1, 2, 3, 4, 8, 16
  * @note Values > 8 for M0 are not expected to be efficient
  *
- * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  lhs_w                             The width of the lhs tensor
- * @param[in]  lhs_h                             The height of the lhs tensor
- * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in]  rhs_img                           (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  rhs_w                             The width of the rhs tensor
- * @param[in]  rhs_h                             The height of the rhs tensor
- * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in]  dst_w                             The width of the dst tensor
- * @param[in]  dst_h                             The height of the dst tensor
- * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
  */
 __kernel void mat_mul_native_nt_nt(
     TENSOR3D_T(lhs, BUFFER),
     TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
     TENSOR3D_T(dst, BUFFER))
 {
     const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -149,6 +174,10 @@
         indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
     });
 
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
     T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
 
     T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -173,31 +202,41 @@
  *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
  * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
  *
- * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  lhs_w                             The width of the lhs tensor
- * @param[in]  lhs_h                             The height of the lhs tensor
- * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in]  rhs_img                           (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  rhs_w                             The width of the rhs tensor
- * @param[in]  rhs_h                             The height of the rhs tensor
- * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in]  dst_w                             The width of the dst tensor
- * @param[in]  dst_h                             The height of the dst tensor
- * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
  */
 __kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
                                   TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+                                  TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
                                   TENSOR3D_T(dst, BUFFER))
 
 {
@@ -306,6 +345,10 @@
         indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
     });
 
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
     T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
 
     T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -330,32 +373,42 @@
  *  - K0 > 0
  * * @note Values > 8 for M0, and K0 are not expected to be efficient
  *
- * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  lhs_w                             The width of the lhs tensor
- * @param[in]  lhs_h                             The height of the lhs tensor
- * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in]  rhs_img                           (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  rhs_w                             The width of the rhs tensor
- * @param[in]  rhs_h                             The height of the rhs tensor
- * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in]  dst_w                             The width of the dst tensor
- * @param[in]  dst_h                             The height of the dst tensor
- * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
  */
 __kernel void mat_mul_native_t_nt(
     TENSOR3D_T(lhs, BUFFER),
     TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
     TENSOR3D_T(dst, BUFFER))
 {
     const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -459,6 +512,10 @@
         indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
     });
 
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
     T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
 
     T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
@@ -483,32 +540,42 @@
  *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
  * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
  *
- * @param[in]  lhs_ptr                           Pointer to the lhs matrix. Supported data types: F32/F16
- * @param[in]  lhs_stride_y                      Stride of the lhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  lhs_stride_z                      Stride of the lhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  lhs_w                             The width of the lhs tensor
- * @param[in]  lhs_h                             The height of the lhs tensor
- * @param[in]  lhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the lhs matrix
- * @param[in]  rhs_img                           (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
- * @param[in]  rhs_ptr                           Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  rhs_stride_y                      Stride of the rhs matrix in Y (2nd) dimension (in bytes)
- * @param[in]  rhs_stride_z                      Stride of the rhs tensor in Z (3rd) dimension (in bytes)
- * @param[in]  rhs_w                             The width of the rhs tensor
- * @param[in]  rhs_h                             The height of the rhs tensor
- * @param[in]  rhs_n                             Number of the matrices (buffers) in the batch
- * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the rhs matrix
- * @param[out] dst_ptr                           Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
- * @param[in]  dst_stride_y                      Stride of the dst matrix in Y (2nd) dimension (in bytes)
- * @param[in]  dst_stride_z                      Stride of the dst tensor in Z (3rd) dimension (in bytes)
- * @param[in]  dst_w                             The width of the dst tensor
- * @param[in]  dst_h                             The height of the dst tensor
- * @param[in]  dst_n                             Number of the matrices (buffers) in the batch
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the dst matrix
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr,
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
  */
 __kernel void mat_mul_native_t_t(
     TENSOR3D_T(lhs, BUFFER),
     TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
     TENSOR3D_T(dst, BUFFER))
 {
     const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
@@ -630,6 +697,10 @@
         indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
     });
 
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
     T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
 
     T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);