Implement CLDirectConv3D f32/f16

Resolve COMPMID-4660

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ibd66ec1eb6faa60086981b1e3a9c12561df3445f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6420
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 564cbf9..f9d18ec 100644
--- a/src/core/CL/cl_kernels/common/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -703,7 +703,7 @@
 
     a_offset_s32[0].v *= A_OFFSET;
 
-    T_ADD_BROADCAST_X(int, M0, 1, offset_s32, a_offset_s32, offset_s32);
+    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
 #endif // defined(A_OFFSET)
 
 #if defined(B_OFFSET)
@@ -728,7 +728,7 @@
 
     T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
 
-    T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, 1, offset_s32, bias, offset_s32);
+    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, bias, offset_s32);
 #endif // defined(ADD_BIAS)
 
     LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -786,7 +786,7 @@
 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
     })
 
-    const bool cond_x = (xo > (N - N0)) && (PARTIAL_STORE_N0 != 0);
+    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);
 
 #if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
     T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);