COMPMID-417 Fix ROIPooling

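* Fix ROIPooling in NEON, CL and Reference.
* Compute both pooled-region bounds as floor(index / pooled_dim * roi_dim);
  the end bound previously used ceil on a precomputed region-size ratio.
* CL: load the four ROI coordinates with vload4 and read the batch index
  as a separate scalar instead of loading eight values with vload8.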

Change-Id: Id5066625e5073e0bfebe69391f7941e993003296
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87435
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index 35a9c0a..042b102 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -138,24 +138,23 @@
     // Load roi parameters
     // roi is laid out as follows:
     // { x, y, width, height, batch_index }
-    const ushort8 roi     = vload8(0, (__global ushort *)vector_offset(&rois, pw));
-    const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
-    const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
-
-    // Determine pooled region in input image to pooled region in output image ratio
-    const float2 pool_region_ratio = convert_float2(roi_dims) / (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const ushort4 roi      = vload4(0, (__global ushort *)vector_offset(&rois, pw));
+    const ushort roi_batch = *((__global ushort *)vector_offset(&rois, pw) + 4);
+    const int2 roi_anchor  = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+    const int2 roi_dims    = convert_int2_sat(fmax(round(convert_float2(roi.s23) * (float)SPATIAL_SCALE), 1.f));
 
     // Calculate pooled region start and end
     const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
     const int2   max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y);
-    int2         region_start     = convert_int2_sat(floor(spatial_indx * pool_region_ratio)) + roi_anchor;
-    int2         region_end       = convert_int2_sat(ceil((spatial_indx + 1) * pool_region_ratio)) + roi_anchor;
+    int2         region_start     = convert_int2_sat(floor(spatial_indx / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+    int2         region_end       = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
 
     region_start = clamp(region_start, 0, max_spatial_dims);
     region_end   = clamp(region_end, 0, max_spatial_dims);
 
     // Move input and output pointer across the fourth dimension
-    input.ptr += roi.s4 * input_stride_w;
+    input.ptr += roi_batch * input_stride_w;
     output.ptr += pw * output_stride_w;
 
     for(int pz = 0; pz < MAX_DIM_Z; ++pz)
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 83ab611..a209a52 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -105,10 +105,6 @@
         const int roi_width    = std::max(support::cpp11::round(curr_roi.rect.width * spatial_scale), 1.f);
         const int roi_height   = std::max(support::cpp11::round(curr_roi.rect.height * spatial_scale), 1.f);
 
-        // Determine pooling regions
-        const auto pool_region_size_x = static_cast<float>(roi_width) / pooled_w;
-        const auto pool_region_size_y = static_cast<float>(roi_height) / pooled_h;
-
         // Iterate through all feature maps
         for(int fm = 0; fm < fms; ++fm)
         {
@@ -117,10 +113,10 @@
             {
                 for(int px = 0; px < pooled_w; ++px)
                 {
-                    auto region_start_x = static_cast<int>(std::floor(px * pool_region_size_x));
-                    auto region_end_x   = static_cast<int>(std::ceil((px + 1) * pool_region_size_x));
-                    auto region_start_y = static_cast<int>(std::floor(py * pool_region_size_y));
-                    auto region_end_y   = static_cast<int>(std::ceil((py + 1) * pool_region_size_y));
+                    auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
+                    auto region_end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+                    auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+                    auto region_end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
 
                     region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
                     region_end_x   = std::min(std::max(region_end_x + roi_anchor_x, 0), width);