COMPMID-1343: Add grouping support to CLCol2ImKernel

Change-Id: I5188a2163e7341f1915d98c21464fea13a9a7faf
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/143330
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index 98bf8d1..5e52127 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -41,12 +41,15 @@
  * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
  * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
  * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
@@ -59,11 +62,14 @@
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void col2im(
-    IMAGE_DECLARATION(src),
+    TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     uint dst_stride_w)
 {
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
+    const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
 
     VEC_DATA_TYPE(DATA_TYPE, 8)
     data = vload8(0, (__global DATA_TYPE *)src.ptr);
@@ -82,8 +88,16 @@
 
     __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes;
 
-    // Compute output offset
-    int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+#if defined(GROUPING)
+    // Compute output offset (batches on 4th dimension, no need to compute manually)
+    int idx = yd * dst_stride_y + xd * dst_stride_x;
+
+    const uint group = get_global_id(2); // group ID
+    x_clamped += group * WIDTH_INPUT;
+#else  /* defined(GROUPING) */
+    // Compute output offset (batches on 3rd dimension)
+    int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w;
+#endif /* GROUPING */
 
     // Store value
     *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0;
@@ -95,4 +109,4 @@
     *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
     *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
 }
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)