COMPMID-358 Implement OpenCL ROI Pooling

* Implement OpenCL ROI Pooling
* Add CLROIPoolingLayer benchmarks

Change-Id: I8786d01d551850a1b4d599a48fabe3925e0a27d0
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79833
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
index 1b676ed..e12695f 100644
--- a/arm_compute/core/CL/ICLArray.h
+++ b/arm_compute/core/CL/ICLArray.h
@@ -107,6 +107,7 @@
 using ICLKeyPointArray        = ICLArray<KeyPoint>;
 using ICLCoordinates2DArray   = ICLArray<Coordinates2D>;
 using ICLDetectionWindowArray = ICLArray<DetectionWindow>;
+using ICLROIArray             = ICLArray<ROI>;
 using ICLSize2DArray          = ICLArray<Size2D>;
 using ICLUInt8Array           = ICLArray<cl_uchar>;
 using ICLUInt16Array          = ICLArray<cl_ushort>;
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index cfbf760..1334c54 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -31,6 +31,8 @@
 
 namespace arm_compute
 {
+template <typename T>
+class ICLArray;
 class ICLTensor;
 class Window;
 
@@ -45,6 +47,16 @@
      * @return A reference to the OpenCL kernel of this object.
      */
     cl::Kernel &kernel();
+    /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     array          Array to set as an argument of the object's kernel.
+     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
+     * @param[in]     num_dimensions Number of dimensions of the @p array.
+     * @param[in]     window         Window the kernel will be executed on.
+     */
+    template <typename T>
+    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
     /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -73,6 +85,11 @@
      * @param[in]     window Window the kernel will be executed on.
      */
     void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    /** Returns the number of arguments enqueued per 1D array object.
+     *
+     * @return The number of arguments enqueued per 1D array object.
+     */
+    unsigned int num_arguments_per_1D_array() const;
     /** Returns the number of arguments enqueued per 1D tensor object.
      *
      * @return The number of arguments enqueues per 1D tensor object.
@@ -142,6 +159,16 @@
     GPUTarget get_target() const;
 
 private:
+    /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     array          Array to set as an argument of the object's kernel.
+     * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
+     * @param[in]     num_dimensions Number of dimensions of the @p array.
+     * @param[in]     window         Window the kernel will be executed on.
+     */
+    template <typename T, unsigned int dimension_size>
+    void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
     /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -150,6 +177,12 @@
      */
     template <unsigned int dimension_size>
     void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    /** Returns the number of arguments enqueued per array object.
+     *
+     * @return The number of arguments enqueued per array object.
+     */
+    template <unsigned int dimension_size>
+    unsigned int           num_arguments_per_array() const;
     /** Returns the number of arguments enqueued per tensor object.
      *
      * @return The number of arguments enqueued per tensor object.
@@ -177,5 +210,50 @@
  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
  */
 void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange());
+
+template <typename T, unsigned int dimension_size>
+void ICLKernel::add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+{
+    // The array is dereferenced below to fetch its OpenCL buffer, so reject a null pointer up front
+    ARM_COMPUTE_ERROR_ON_MSG(array == nullptr, "add_%dD_array_argument() requires a non-null array", dimension_size);
+
+    // Byte offset of the first element covered by the window, accumulated over num_dimensions dimensions
+    unsigned int offset_first_element = 0;
+    for(unsigned int n = 0; n < num_dimensions; ++n)
+    {
+        offset_first_element += window[n].start() * strides[n];
+    }
+
+    // Kernel arguments, in order: buffer, then (stride, stride * window step) per dimension, then the offset
+    unsigned int idx_start = idx;
+    _kernel.setArg(idx++, array->cl_buffer());
+    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    {
+        _kernel.setArg<cl_uint>(idx++, strides[dimension]);
+        _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
+    }
+    _kernel.setArg<cl_uint>(idx++, offset_first_element);
+    ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_array<dimension_size>() != idx,
+                             "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+    ARM_COMPUTE_UNUSED(idx_start);
+}
+
+template <typename T>
+void ICLKernel::add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+{
+    this->add_array_argument<T, 1>(idx, array, strides, num_dimensions, window); // 1D case of the generic helper: a single stride/step pair
+}
+
+template <unsigned int dimension_size>
+unsigned int           ICLKernel::num_arguments_per_array() const
+{
+    return this->num_arguments_per_tensor<dimension_size>(); // an array is counted with the same formula as a tensor of equal dimensionality
+}
+
+template <unsigned int dimension_size>
+unsigned int           ICLKernel::num_arguments_per_tensor() const
+{
+    return 2 * (dimension_size + 1); // same value as 2 + 2 * dimension_size: two fixed arguments plus two per dimension
+}
 }
 #endif /*__ARM_COMPUTE_ICLKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
new file mode 100644
index 0000000..51aae30
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel interface for the ROI pooling layer, configured from an input tensor, an array of @ref ROI and an output tensor */
+class CLROIPoolingLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLROIPoolingLayerKernel();
+    /** Prevent instances of this class from being copy-constructed (As this class contains pointers) */
+    CLROIPoolingLayerKernel(const CLROIPoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    CLROIPoolingLayerKernel &operator=(const CLROIPoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be move-constructed */
+    CLROIPoolingLayerKernel(CLROIPoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be move-assigned */
+    CLROIPoolingLayerKernel &operator=(CLROIPoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLROIPoolingLayerKernel() = default;
+
+    /** Set the input tensor, the regions of interest, the output tensor and the pooling information.
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32.
+     * @param[in]  rois      Array containing @ref ROI.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden (from ICLKernel):
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor    *_input; /**< Presumably the @p input passed to configure() (definition not visible in this header) */
+    const ICLROIArray *_rois; /**< Presumably the @p rois passed to configure() (definition not visible in this header) */
+    ICLTensor          *_output; /**< Presumably the @p output passed to configure() (definition not visible in this header) */
+    ROIPoolingLayerInfo _pool_info; /**< Held by value; presumably a copy of the @p pool_info passed to configure() */
+};
+}
+#endif /*__ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
index 3a2f761..40f79ac 100644
--- a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -52,9 +52,14 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. Data types supported: F32.
-     * @param[in]  rois      Array containing the regions of interest.
+     * @param[in]  rois      Array containing @ref ROI.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
     void configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
 
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
index f4c2ef0..3dc7f19 100644
--- a/arm_compute/runtime/CL/CLArray.h
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -97,6 +97,7 @@
 using CLKeyPointArray        = CLArray<KeyPoint>;
 using CLCoordinates2DArray   = CLArray<Coordinates2D>;
 using CLDetectionWindowArray = CLArray<DetectionWindow>;
+using CLROIArray             = CLArray<ROI>;
 using CLSize2DArray          = CLArray<Size2D>;
 using CLUInt8Array           = CLArray<cl_uchar>;
 using CLUInt16Array          = CLArray<cl_ushort>;
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
new file mode 100644
index 0000000..f089375
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLROIPOOLINGLAYER_H__
+#define __ARM_COMPUTE_CLROIPOOLINGLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLROIPoolingLayerKernel.
+ *
+ * This function calls the following OpenCL kernels:
+ * -# @ref CLROIPoolingLayerKernel
+ * @note Only configure() is declared here; the rest of the interface is inherited from @ref ICLSimpleFunction.
+ */
+class CLROIPoolingLayer : public ICLSimpleFunction
+{
+public:
+    /** Set the input tensor, the regions of interest, the output tensor and the pooling information.
+     *
+     * @param[in]  input     Source tensor. Data types supported: F16/F32.
+     * @param[in]  rois      Array containing @ref ROI.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+     */
+    void configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLROIPOOLINGLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 04b5c35..5adc111 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -47,9 +47,14 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input     Source tensor. Data types supported: F32.
-     * @param[in]  rois      Array containing the regions of interest.
+     * @param[in]  rois      Array containing @ref ROI.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+     *
+     * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled
+     * width and pooled height.
+     * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+     * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
     void configure(const ITensor *input, const IROIArray *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);