COMPMID-1676: Change CLROIAlign interface to accept ROIs as tensors

Change-Id: I69e995973597ba3927d29e4f6ed5438560e53d77
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
index 4625e53..f52eb18 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -97,38 +97,40 @@
  * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
  *       will have a default sampling ratio of roi_dims/pooling_dims
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source image as specifed by ROI
- * @param[in]  rois_ptr                             Pointer to the rois array. Layout: {x, y, width, height, batch_indx}
- * @param[in]  rois_stride_x                        Stride of the rois array in X dimension (in bytes)
- * @param[in]  rois_step_x                          rois_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the rois array
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: F16, F32
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_stride_w                       Stride of the source image in W dimension (in bytes)
- * @param[in]  output_stride_w                      Stride of the destination image in W dimension (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void roi_align_layer(
     TENSOR3D_DECLARATION(input),
-    VECTOR_DECLARATION(rois),
+    IMAGE_DECLARATION(rois),
     TENSOR3D_DECLARATION(output),
     unsigned int input_stride_w, unsigned int output_stride_w)
 {
     // Get pixels pointer
     Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    Vector   rois   = CONVERT_TO_VECTOR_STRUCT_NO_STEP(rois);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
     Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
 
     const int px = get_global_id(0);
@@ -136,19 +138,19 @@
     const int pw = get_global_id(2);
 
     // Load roi parameters
-    // roi is laid out as follows:
-    // { x, y, width, height, batch_index }
-    const ushort4 roi       = vload4(0, (__global ushort *)vector_offset(&rois, pw));
-    const ushort roi_batch  = *((__global ushort *)vector_offset(&rois, pw) + 4);
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi                 = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
     const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
-    const float2 roi_dims   = fmax(convert_float2(roi.s23) * convert_float(SPATIAL_SCALE), 1.f);
+    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
 
     // Calculate pooled region start and end
     const float2 spatial_indx     = (float2)(px, py);
     const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
     const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
 
-    const float2 bin_size     = roi_dims / pooled_dims;
+    const float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
     float2       region_start = spatial_indx * bin_size + roi_anchor;
     float2       region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
 
@@ -159,7 +161,7 @@
     const float2 roi_bin_grid = SAMPLING_RATIO;
 #else  // !defined(SAMPLING_RATIO)
     // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
-    const float2 roi_bin_grid = ceil(roi_dims / pooled_dims - EPS_GRID);
+    const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
 #endif // defined(SAMPLING_RATIO)
 
     // Move input and output pointer across the fourth dimension
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 2e1e854..2d2ac07 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -39,24 +39,47 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, size_t num_rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, rois);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
-    ARM_COMPUTE_RETURN_ERROR_ON(num_rois == 0);
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
-        ARM_COMPUTE_RETURN_ERROR_ON(num_rois != output->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
     }
 
     return Status{};
 }
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto inizialitation if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    Window             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 CLROIAlignLayerKernel::CLROIAlignLayerKernel()
@@ -64,13 +87,14 @@
 {
 }
 
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->num_values(), output->info(), pool_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
 
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input     = input;
     _output    = output;
@@ -78,46 +102,27 @@
     _pool_info = pool_info;
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX))));
-    build_opts.emplace(("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY))));
-    build_opts.emplace(("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ))));
-    build_opts.emplace(("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width())));
-    build_opts.emplace(("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height())));
-    build_opts.emplace(("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale())));
-    if(pool_info.sampling_ratio() > 0)
-    {
-        build_opts.emplace(("-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio())));
-    }
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+    build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+    build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+    build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
+    build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
 
     // Create kernel
     std::string kernel_name = "roi_align_layer";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    // Set static kernel arguments
-    unsigned int idx = 2 * num_arguments_per_3D_tensor() + num_arguments_per_1D_array();
-    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
-    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 1;
-    Window             window                            = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input->info(),
-                                    input->info()->valid_region().start(0),
-                                    input->info()->valid_region().start(1),
-                                    input->info()->valid_region().end(0),
-                                    input->info()->valid_region().end(1));
-    AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
-
-    output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure_internal(window);
+    ICLKernel::configure_internal(win_config.second);
 }
 
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, size_t num_rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, num_rois, output, pool_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
     return Status{};
 }
 
@@ -126,16 +131,20 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
+    Window slice      = window.first_slice_window_3D();
+    Window slice_rois = slice;
+    // Parallelize spatially and across the fourth dimension of the output tensor (also across ROITensor)
+    slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
     slice.set(Window::DimZ, window[3]);
 
     // Set arguments
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input, slice);
-    add_1D_array_argument<ROI>(idx, _rois, Strides(sizeof(ROI)), 1U, slice);
+    add_2D_tensor_argument(idx, _rois, slice_rois);
     add_3D_tensor_argument(idx, _output, slice);
+    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
     enqueue(queue, *this, slice);
 }
-
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 1528759..5bfd594 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -29,14 +29,14 @@
 
 namespace arm_compute
 {
-Status CLROIAlignLayer::validate(const ITensorInfo *input, size_t num_rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, num_rois, output, pool_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
 
     return Status{};
 }
 
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLROIArray *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
 {
     // Configure ROI pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>();