COMPMID-522 - Added support for GlobalPooling in CLPoolingLayer and CLFlattening for 3D tensor

Change-Id: Ifc7db1e4d4af322a4dcbfeb3e132e5c326596872
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86618
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 18ad4a6..0497bf4 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -415,3 +415,101 @@
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
 }
+
+#if defined(POOL_SIZE)
+
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG)
+#define INITIAL_VALUE 0
+#else // POOL_AVG
+#ifdef FIXED_POINT_POSITION
+#define MIN_VAL_EXPAND(type) type##_MIN
+#define MIN_VAL(type) MIN_VAL_EXPAND(type)
+#define INITIAL_VALUE MIN_VAL(DATA_TYPE)
+#define INITIAL_VALUE 0
+#else // FIXED_POINT_POSITION
+#if FP16
+#define INITIAL_VALUE -HALF_MAX
+#else // FP16
+#define INITIAL_VALUE -FLT_MAX
+#endif // FP16
+#endif // FIXED_POINT_POSITION
+
+#endif // POOL_AVG
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note -DFP16 must be passed at compile time if half float data type is used
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    vdata           = INITIAL_VALUE;
+    DATA_TYPE sdata = INITIAL_VALUE;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE - 8); x += 8)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, 8)
+            data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            vdata = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE; ++x)
+        {
+            DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            sdata           = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    reduce2       = POOL_OP(reduce4.s01, reduce4.s23);
+    DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+    res           = POOL_OP(res, sdata);
+
+    // Divide by pool region in case of average pooling
+#ifdef POOL_AVG
+    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* POOL_AVG */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = res;
+}
+#endif // defined(POOL_SIZE)
\ No newline at end of file