COMPMID-443: Use 3D tensors for fill_border_image

2x performance improvement on some GoogLeNet Pooling tests

Change-Id: If75336aa6308731a06462a73cd9209d24574509e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80342
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index df63586..5fbe3ea 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -36,18 +36,20 @@
  * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]     buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]     buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]     buf_offset_first_element_in_bytes The offset of the first element in the source image
  * @param[in]     width                             Width of the valid region of the image
  * @param[in]     height                            Height of the valid region of the image
  * @param[in]     start_pos                         XY coordinate indicating the start point of the valid region
  */
 __kernel void fill_image_borders_replicate(
-    IMAGE_DECLARATION(buf),
+    TENSOR3D_DECLARATION(buf),
     uint width,
     uint height,
     int2 start_pos)
 {
-    Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
 
     // Update pointer to point to the starting point of the valid region
     buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
@@ -109,6 +111,8 @@
  * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]  buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]  buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  buf_offset_first_element_in_bytes The offset of the first element in the source image
  * @param[in]  width                             Width of the valid region of the image
  * @param[in]  height                            Height of the valid region of the image
@@ -116,13 +120,13 @@
  * @param[in]  constant_value                    Constant value to use to fill the edges
  */
 __kernel void fill_image_borders_constant(
-    IMAGE_DECLARATION(buf),
+    TENSOR3D_DECLARATION(buf),
     uint      width,
     uint      height,
     int2      start_pos,
     DATA_TYPE constant_value)
 {
-    Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
 
     // Update pointer to point to the starting point of the valid region
     buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index cf3cb78..29a43f7 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -78,6 +78,9 @@
 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
 
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
 #define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
     update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                  name##_stride_z, name##_step_z)
@@ -157,6 +160,32 @@
     return img;
 }
 
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr                           Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y
+    };
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    return img;
+}
+
 /** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
  *
  * @param[in] ptr                           Pointer to the starting postion of the buffer
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 7683ff9..2c751a4 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -108,7 +108,7 @@
     const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
 
     // Set static kernel arguments
-    unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
     ICLKernel::add_argument<cl_uint>(idx, valid_width);
     ICLKernel::add_argument<cl_uint>(idx, valid_height);
     ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
@@ -163,13 +163,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    Window slice = window.first_slice_window_2D();
+    Window slice = window.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _tensor, slice);
+        add_3D_tensor_argument(idx, _tensor, slice);
         enqueue(queue, *this, slice, cl::NullRange);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }