COMPMID-801: NHWC support in CLIm2Col.

And extended tests coverage adding kernel shapes 3x1, 1x5 and 7x7

Change-Id: Ia7c1d4da2368d5f5fbc1a41187f4ac1aca5f150f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127727
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 1e85e1b..f53ce21 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -123,7 +123,207 @@
 }
 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
 
+#define PTR_TO_VALUE(PTR, DATA_TYPE) *((DATA_TYPE *)(PTR))
+
 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int src_stride_y_int = (int)src_stride_y;
+    const int src_stride_z_int = (int)src_stride_z;
+    const int xc               = get_global_id(1);                    // x coordinate in the convolved tensor
+    const int yc               = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
+    const int ch               = get_global_id(0);                    // input feature map
+    const int batch            = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+    const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+    {
+        const int y0 = yi + yk;
+        if(y0 >= 0 && y0 < SRC_HEIGHT)
+        {
+            int xk;
+            for(xk = 0; xk < KERNEL_WIDTH; xk++)
+            {
+                const int x0 = xi + xk;
+                if(x0 >= 0 && x0 < SRC_WIDTH)
+                {
+                    *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + xk * src_stride_y + yk * src_stride_z, DATA_TYPE);
+                }
+                else
+                {
+                    *((__global DATA_TYPE *)output_ptr) = PAD_VALUE;
+                }
+                output_ptr += 1 * sizeof(DATA_TYPE);
+            }
+        }
+        else
+        {
+            for(int xk = 0; xk < KERNEL_WIDTH; xk++)
+            {
+                *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;
+                output_ptr += 1 * dst_stride_x;
+            }
+        }
+    }
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+        output_ptr += 1 * dst_stride_x;
+    }
+#endif // HAS_BIAS
+}
+
+/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int src_stride_y_int = (int)src_stride_y;
+    const int src_stride_z_int = (int)src_stride_z;
+    const int xc               = get_global_id(1);                    // x coordinate in the convolved tensor
+    const int yc               = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
+    const int ch               = get_global_id(0);                    // input feature map
+    const int batch            = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+    const int xo = ch * 9;                    // 3x3
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
+
+    const int3 y = (int3)yi + (int3)(0, 1, 2);
+    // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.
+    if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)
+    {
+        row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+                   PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));
+    }
+
+    if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)
+    {
+        row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+                   PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));
+    }
+
+    if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)
+    {
+        row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
+                   PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
+                   PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));
+    }
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Put 0 if the value is out-of-bound
+    const int3 x = (int3)xi + (int3)(0, 1, 2);
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    row0  = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
+    row1  = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);
+    row2  = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
+    *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
+
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+        *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
@@ -804,4 +1004,4 @@
     }
 #endif // HAS_BIAS
 }
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)