COMPMID-1451 Change PriorBox output to NCHW

Output of PriorBox should be independent of the input
data layout and should always be in NCHW format.

Change-Id: Ie80cd4e51c78945b158c0db1af1923bdf8d7ea7b
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 2f63179..365fc83 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -67,8 +67,7 @@
 
     if(output != nullptr && output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
 
     return Status{};
@@ -76,29 +75,13 @@
 
 std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
 {
-    ARM_COMPUTE_UNUSED(input2);
+    ARM_COMPUTE_UNUSED(input1, input2);
 
-    Window win            = {};
-    bool   window_changed = false;
-    switch(input1->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const int          num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
-            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-            win                                                  = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, output_access);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            win = calculate_max_window(*output, Steps());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    };
+    const int              num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+    const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
+    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
@@ -106,11 +89,10 @@
 } // namespace
 
 NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
 {
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
                                               const int height)
 {
@@ -119,49 +101,23 @@
     float xmax = (center_x + box_width / 2.f) / width;
     float ymax = (center_y + box_height / 2.f) / height;
 
-    switch(DL)
+    float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+    if(_info.clip())
     {
-        case DataLayout::NCHW:
-        {
-            float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
-            if(_info.clip())
-            {
-                static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
-                static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-                vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
-            }
-            vst1q_f32(out + offset, vec_elements);
-        }
-        break;
-        case DataLayout::NHWC:
-        {
-            const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
-            if(_info.clip())
-            {
-                xmin = std::min(std::max(xmin, 0.f), 1.f);
-                ymin = std::min(std::max(ymin, 0.f), 1.f);
-                xmax = std::min(std::max(xmax, 0.f), 1.f);
-                ymax = std::min(std::max(ymax, 0.f), 1.f);
-            }
-
-            *(out + output_offset * offset)       = xmin;
-            *(out + output_offset * (offset + 1)) = ymin;
-            *(out + output_offset * (offset + 2)) = xmax;
-            *(out + output_offset * (offset + 3)) = ymax;
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
+        static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+        static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+        vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
     }
+    vst1q_f32(out + offset, vec_elements);
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
 {
     const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
 
-    const int width_idx  = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
-    const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+    const DataLayout data_layout = _input1->info()->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     const int layer_width  = _input1->info()->dimension(width_idx);
     const int layer_height = _input1->info()->dimension(height_idx);
@@ -182,44 +138,17 @@
         step_y = static_cast<float>(img_height) / layer_height;
     }
 
-    Window slice = {};
-
-    switch(DL)
-    {
-        case DataLayout::NCHW:
-            slice = window.first_slice_window_2D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
-            break;
-        case DataLayout::NHWC:
-            slice = window.first_slice_window_3D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
-            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    Window slice = window.first_slice_window_2D();
+    slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
 
     Iterator output(_output, slice);
     execute_window_loop(slice, [&](const Coordinates & id)
     {
         float center_x = 0;
         float center_y = 0;
-        int   idx      = 0;
-        switch(DL)
-        {
-            case DataLayout::NCHW:
-                idx      = id.x() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            case DataLayout::NHWC:
-                idx      = id.y() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-        }
+        int   idx      = id.x() / (4 * num_priors);
+        center_x       = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+        center_y       = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
 
         float box_width;
         float box_height;
@@ -231,7 +160,7 @@
             const float min_size = _info.min_sizes().at(i);
             box_width            = min_size;
             box_height           = min_size;
-            store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+            store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
             offset += 4;
 
             if(!_info.max_sizes().empty())
@@ -240,7 +169,7 @@
                 box_width            = std::sqrt(min_size * max_size);
                 box_height           = box_width;
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
 
@@ -255,50 +184,27 @@
                 box_width  = min_size * sqrt(ar);
                 box_height = min_size / sqrt(ar);
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
         }
 
         // set the variance
-        switch(DL)
+        out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+        float32x4_t var;
+        if(_info.variances().size() == 1)
         {
-            case DataLayout::NCHW:
-            {
-                out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
-                float32x4_t var;
-                if(_info.variances().size() == 1)
-                {
-                    var = vdupq_n_f32(_info.variances().at(0));
-                }
-                else
-                {
-                    const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
-                    var                    = vars;
-                }
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    vst1q_f32(out + 4 * i, var);
-                }
-            }
-            break;
-            case DataLayout::NHWC:
-            {
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    const int  prior_offset = 4 * i;
-                    const bool single_var   = _info.variances().size() == 1;
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
-                }
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
+            var = vdupq_n_f32(_info.variances().at(0));
         }
-
+        else
+        {
+            const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+            var                    = vars;
+        }
+        for(int i = 0; i < num_priors; ++i)
+        {
+            vst1q_f32(out + 4 * i, var);
+        }
     },
     output);
 }
@@ -314,22 +220,6 @@
     _info   = info;
     _output = output;
 
-    switch(input1->info()->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented.");
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -350,9 +240,8 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
     // Run function
-    (this->*_func)(window);
+    calculate_prior_boxes(window);
 }
 } // namespace arm_compute
\ No newline at end of file