COMPMID-1451 Change PriorBox output to NCHw

Output of Priorbox should be independent of the input
data layout and should  always be in NCHW format

Change-Id: Ie80cd4e51c78945b158c0db1af1923bdf8d7ea7b
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index ff4803e..33e6670 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -355,7 +355,6 @@
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "prior_box_layer_nchw", "prior_box_layer.cl" },
-    { "prior_box_layer_nhwc", "prior_box_layer.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "reduction_operation_x", "reduction_operation.cl" },
     { "reduction_operation_quantized_x", "reduction_operation.cl" },
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
index be072ec..046151b 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -104,88 +104,6 @@
 
     return idx;
 }
-
-/** Compute prior boxes and clip (NHWC)
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  idx                                  Index to write to
- * @param[in]  center_x                             Center value of the x axis
- * @param[in]  center_y                             Center value of the y axis
- * @param[in]  box_width                            Prior box width
- * @param[in]  box_height                           Prior box height
- *
- */
-inline void calculate_xy_min_max_nhwc(Tensor3D *out, int idx, float center_x, float center_y, float box_width, float box_height)
-{
-    float xmin = (center_x - box_width / 2.f) / WIDTH;
-    float ymin = (center_y - box_height / 2.f) / HEIGHT;
-    float xmax = (center_x + box_width / 2.f) / WIDTH;
-    float ymax = (center_y + box_height / 2.f) / HEIGHT;
-
-#if defined(CLIP)
-    xmin = clamp(xmin, 0.f, 1.f);
-    ymin = clamp(ymin, 0.f, 1.f);
-    xmax = clamp(xmax, 0.f, 1.f);
-    ymax = clamp(ymax, 0.f, 1.f);
-#endif // defined(CLIP)
-
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 0, 0)) = xmin;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 1, 0)) = ymin;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 2, 0)) = xmax;
-    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 3, 0)) = ymax;
-}
-
-/** Compute prior boxes (NHWC)
- *
- * @param[in,out] out                Tensor output
- * @param[in]     max                The maximum values
- * @param[in]     aspect_ratios      The aspect ratio values
- * @param[in]     max_size           The maximum values values size
- * @param[in]     aspect_ratios_size The aspect ratio values size
- * @param[in]     min_size           The minimum values size
- * @param[in]     min_idx            Index of the min vector
- * @param[in]     idx                Index to write to
- *
- * @return The updated index
- */
-inline int calculate_min_nhwc(Tensor3D *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
-{
-    const float center_x = ((float)(get_global_id(1) % LAYER_WIDTH) + OFFSET) * STEP_X;
-    const float center_y = ((float)(get_global_id(1) / LAYER_WIDTH) + OFFSET) * STEP_Y;
-
-    float box_width  = min_size;
-    float box_height = min_size;
-
-    calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-    idx += 4;
-    if(max_size > 0)
-    {
-        box_width  = sqrt(min_size * max[min_idx]);
-        box_height = box_width;
-        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-        idx += 4;
-    }
-    for(unsigned int i = 0; i < aspect_ratios_size; ++i)
-    {
-        if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
-        {
-            continue;
-        }
-        box_width  = min_size * sqrt(aspect_ratios[i]);
-        box_height = min_size * rsqrt(aspect_ratios[i]);
-
-        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
-        idx += 4;
-    }
-
-    return idx;
-}
-
 /** Calculate prior boxes with NCHW format.
  *
  * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
@@ -218,39 +136,4 @@
         vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
     }
 }
-
-/** Calculate prior boxes with NHWC format.
- *
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  min                                  The minimum values
- * @param[in]  max                                  The maximum_values
- * @param[in]  aspect_ratios                        The aspect ratio values
- * @param[in]  min_size                             The minimum values size
- * @param[in]  max_size                             The maximum_values values size
- * @param[in]  aspect_ratios_size                   The aspect ratio values size
- */
-__kernel void prior_box_layer_nhwc(TENSOR3D_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
-                                   unsigned int aspect_ratios_size)
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    int idx = 0;
-    for(unsigned int i = 0; i < min_size; ++i)
-    {
-        idx = calculate_min_nhwc(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
-    }
-
-    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
-    {
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 0, 1)) = VARIANCE_0;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 1, 1)) = VARIANCE_1;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 2, 1)) = VARIANCE_2;
-        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 3, 1)) = VARIANCE_3;
-    }
-}
 #endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 63e745e..c76d839 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -73,8 +73,7 @@
 
     if(output != nullptr && output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
 
     return Status{};
@@ -87,29 +86,11 @@
     TensorShape output_shape = compute_prior_box_shape(*input1, info);
     auto_init_if_empty(*output, output_shape, 1, input1->data_type());
 
-    Window win{};
-    bool   window_changed = false;
-
-    switch(input1->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-
-            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, output_access);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            win = calculate_max_window(*output, Steps());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    };
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
+    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
+    Status                 err            = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -188,25 +169,8 @@
         }
     }
 
-    unsigned int idx = 0;
-    // Create kernel
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            idx     = num_arguments_per_2D_tensor();
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            idx     = num_arguments_per_3D_tensor();
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nhwc", build_opts.options()));
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    unsigned int idx = num_arguments_per_2D_tensor();
+    _kernel          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
 
     _kernel.setArg(idx++, *_min);
     _kernel.setArg(idx++, *_max);
@@ -245,31 +209,11 @@
         queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
     }
 
-    switch(_input1->info()->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            Window slice = window.first_slice_window_2D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+    Window slice = window.first_slice_window_2D();
+    slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
 
-            unsigned int idx = 0;
-            add_2D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            Window slice = window.first_slice_window_3D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * _num_priors));
-            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _output, slice);
-            enqueue(queue, *this, slice);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _output, slice);
+    enqueue(queue, *this, slice);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 2f63179..365fc83 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -67,8 +67,7 @@
 
     if(output != nullptr && output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
 
     return Status{};
@@ -76,29 +75,13 @@
 
 std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
 {
-    ARM_COMPUTE_UNUSED(input2);
+    ARM_COMPUTE_UNUSED(input1, input2);
 
-    Window win            = {};
-    bool   window_changed = false;
-    switch(input1->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const int          num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
-            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
-            win                                                  = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, output_access);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            win = calculate_max_window(*output, Steps());
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    };
+    const int              num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+    const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
+    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, output_access);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
@@ -106,11 +89,10 @@
 } // namespace
 
 NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
 {
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
                                               const int height)
 {
@@ -119,49 +101,23 @@
     float xmax = (center_x + box_width / 2.f) / width;
     float ymax = (center_y + box_height / 2.f) / height;
 
-    switch(DL)
+    float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+    if(_info.clip())
     {
-        case DataLayout::NCHW:
-        {
-            float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
-            if(_info.clip())
-            {
-                static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
-                static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-                vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
-            }
-            vst1q_f32(out + offset, vec_elements);
-        }
-        break;
-        case DataLayout::NHWC:
-        {
-            const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
-            if(_info.clip())
-            {
-                xmin = std::min(std::max(xmin, 0.f), 1.f);
-                ymin = std::min(std::max(ymin, 0.f), 1.f);
-                xmax = std::min(std::max(xmax, 0.f), 1.f);
-                ymax = std::min(std::max(ymax, 0.f), 1.f);
-            }
-
-            *(out + output_offset * offset)       = xmin;
-            *(out + output_offset * (offset + 1)) = ymin;
-            *(out + output_offset * (offset + 2)) = xmax;
-            *(out + output_offset * (offset + 3)) = ymax;
-        }
-        break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
+        static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+        static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+        vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
     }
+    vst1q_f32(out + offset, vec_elements);
 }
 
-template <DataLayout DL>
 void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
 {
     const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
 
-    const int width_idx  = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
-    const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+    const DataLayout data_layout = _input1->info()->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     const int layer_width  = _input1->info()->dimension(width_idx);
     const int layer_height = _input1->info()->dimension(height_idx);
@@ -182,44 +138,17 @@
         step_y = static_cast<float>(img_height) / layer_height;
     }
 
-    Window slice = {};
-
-    switch(DL)
-    {
-        case DataLayout::NCHW:
-            slice = window.first_slice_window_2D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
-            break;
-        case DataLayout::NHWC:
-            slice = window.first_slice_window_3D();
-            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
-            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
+    Window slice = window.first_slice_window_2D();
+    slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
 
     Iterator output(_output, slice);
     execute_window_loop(slice, [&](const Coordinates & id)
     {
         float center_x = 0;
         float center_y = 0;
-        int   idx      = 0;
-        switch(DL)
-        {
-            case DataLayout::NCHW:
-                idx      = id.x() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            case DataLayout::NHWC:
-                idx      = id.y() / (4 * num_priors);
-                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
-        }
+        int   idx      = id.x() / (4 * num_priors);
+        center_x       = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+        center_y       = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
 
         float box_width;
         float box_height;
@@ -231,7 +160,7 @@
             const float min_size = _info.min_sizes().at(i);
             box_width            = min_size;
             box_height           = min_size;
-            store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+            store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
             offset += 4;
 
             if(!_info.max_sizes().empty())
@@ -240,7 +169,7 @@
                 box_width            = std::sqrt(min_size * max_size);
                 box_height           = box_width;
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
 
@@ -255,50 +184,27 @@
                 box_width  = min_size * sqrt(ar);
                 box_height = min_size / sqrt(ar);
 
-                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
             }
         }
 
         // set the variance
-        switch(DL)
+        out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+        float32x4_t var;
+        if(_info.variances().size() == 1)
         {
-            case DataLayout::NCHW:
-            {
-                out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
-                float32x4_t var;
-                if(_info.variances().size() == 1)
-                {
-                    var = vdupq_n_f32(_info.variances().at(0));
-                }
-                else
-                {
-                    const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
-                    var                    = vars;
-                }
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    vst1q_f32(out + 4 * i, var);
-                }
-            }
-            break;
-            case DataLayout::NHWC:
-            {
-                for(int i = 0; i < num_priors; ++i)
-                {
-                    const int  prior_offset = 4 * i;
-                    const bool single_var   = _info.variances().size() == 1;
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
-                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
-                }
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Not implemented");
+            var = vdupq_n_f32(_info.variances().at(0));
         }
-
+        else
+        {
+            const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+            var                    = vars;
+        }
+        for(int i = 0; i < num_priors; ++i)
+        {
+            vst1q_f32(out + 4 * i, var);
+        }
     },
     output);
 }
@@ -314,22 +220,6 @@
     _info   = info;
     _output = output;
 
-    switch(input1->info()->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented.");
-    }
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -350,9 +240,8 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
     // Run function
-    (this->*_func)(window);
+    calculate_prior_boxes(window);
 }
 } // namespace arm_compute
\ No newline at end of file