COMPMID-3177: Remove padding from NEBatchNormalizationLayer
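
Drop the padding requirement from NEBatchNormalizationLayerKernel:
the kernel window is now calculated with Steps() (one element per
step), the output valid region covers the full tensor shape, and the
NCHW/NHWC FP16 and FP32 paths are merged into type-templated kernels
built on the wrapper:: intrinsics. Each row is processed with a
vectorised main loop followed by a scalar left-over loop, so no
out-of-bounds accesses, and hence no padding, are needed.

In outline, the per-row loop pattern introduced by this patch looks
as follows (simplified sketch taken from the diff; activation and
window collapsing omitted, names match the kernel):

    const int window_step_x  = 16 / sizeof(T);  // elements per 128-bit vector
    const int window_start_x = static_cast<int>(window.x().start());
    const int window_end_x   = static_cast<int>(window.x().end());

    int x = window_start_x;
    for(; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        // Full vectors: (in - mean) * 1/sqrt(var + eps) * gamma + beta
        const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
        const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
        wrapper::vstore(output_ptr + x, wrapper::vmla(beta_vec, x_bar, gamma_vec));
    }
    for(; x < window_end_x; ++x)
    {
        // Scalar tail handles the remaining elements without reading past the row
        output_ptr[x] = beta + (input_ptr[x] - mean) * denominator * gamma;
    }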

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I9be23e6ef1f552eb159e39fda16c82fa20124094
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3307
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 6bd30ee..3d84ce8 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,10 +33,12 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
 #include <map>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status
@@ -82,56 +84,41 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *var, ITensorInfo *gamma, ITensorInfo *beta)
 {
+    ARM_COMPUTE_UNUSED(mean, var, gamma, beta);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
     if(output != nullptr)
     {
-        // Output tensor auto initialization if not yet initialized
+        // Output auto initialization if not yet initialized
         auto_init_if_empty(*output, *input->clone());
+
+        // NEBatchNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     }
 
-    unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access);
-
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed |= update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
-        window_changed |= update_window_and_padding(win, mean_access, var_access);
-
-        if(gamma != nullptr)
-        {
-            AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
-            window_changed |= update_window_and_padding(win, gamma_access);
-        }
-        if(beta != nullptr)
-        {
-            AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
-            window_changed |= update_window_and_padding(win, beta_access);
-        }
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } //namespace
 
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
 {
-    ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    const int  window_step_x  = 16 / sizeof(T);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win_to_use = window;
+    win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_to_use);
+    Iterator output(_output, win_to_use);
 
     F activation_functor(_act_info);
 
@@ -139,196 +126,168 @@
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
-    float16x8_t       mean_vec    = vdupq_n_f16(0.0);
-    float16x8_t       var_vec     = vdupq_n_f16(0.0);
-    float16x8_t       gamma_vec   = vdupq_n_f16(1.0);
-    float16x8_t       beta_vec    = vdupq_n_f16(0.0);
-    float16x8_t       denominator = vdupq_n_f16(0.0);
-    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
+    T mean        = static_cast<T>(0);
+    T var         = static_cast<T>(0);
+    T gamma       = static_cast<T>(1);
+    T beta        = static_cast<T>(0);
+    T denominator = static_cast<T>(0);
+
+    auto       mean_vec        = wrapper::vdup_n(mean, ExactTagType{});
+    auto       var_vec         = wrapper::vdup_n(var, ExactTagType{});
+    auto       gamma_vec       = wrapper::vdup_n(gamma, ExactTagType{});
+    auto       beta_vec        = wrapper::vdup_n(beta, ExactTagType{});
+    auto       denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
+    const auto epsilon_vec     = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+    execute_window_loop(win_to_use, [&](const Coordinates & id)
     {
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
         if(slice != id.z())
         {
-            // Conctruct vectors
-            mean_vec = vdupq_n_f16(*(input_mean + id.z()));
-            var_vec  = vdupq_n_f16(*(input_var + id.z()));
+            mean     = input_mean[id.z()];
+            var      = input_var[id.z()];
+            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+            var_vec  = wrapper::vdup_n(var, ExactTagType{});
             if(input_gamma != nullptr)
             {
-                gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+                gamma     = input_gamma[id.z()];
+                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
             }
             if(input_beta != nullptr)
             {
-                beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+                beta     = input_beta[id.z()];
+                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
             }
 
             // Calculate denominator
-            denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
-            slice       = id.z();
+            denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+            denominator     = wrapper::vgetlane(denominator_vec, 0);
+            slice           = id.z();
         }
 
-        // Calculate x bar and store results
-        const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
-        const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
-        // Perform fused activation
-        if(fused_activation)
+        // Perform core calculations using vector operations
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            activation_functor(res);
+            // Calculate x bar
+            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+            const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
+            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            wrapper::vstore(output_ptr + x, res);
         }
 
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const T numerator = input_ptr[x] - mean;
+            const T x_bar     = numerator * denominator;
+            T       res       = beta + x_bar * gamma;
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            *(output_ptr + x) = res;
+        }
     },
     input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nhwc(const Window &window)
 {
-    ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    const int  window_step_x  = 16 / sizeof(T);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
 
     F activation_functor(_act_info);
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
-    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
+    const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        // Conctruct vectors
-        const float16x8_t mean_vec  = vld1q_f16(input_mean + id.x());
-        const float16x8_t var_vec   = vld1q_f16(input_var + id.x());
-        const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
-        const float16x8_t beta_vec  = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
-        // Calculate denominator
-        const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        // Calculate x bar and store results
-        const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
-        const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
-        // Perform fused activation
-        if(fused_activation)
-        {
-            activation_functor(res);
-        }
-
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
-    },
-    input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    F activation_functor(_act_info);
-
-    // Hold information about the current feature map we are iterating.
-    // Only compute denominator and NEON vectors once per feature map.
-    int slice = -1;
-
-    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    float32x4_t       mean_vec    = vdupq_n_f32(0.0);
-    float32x4_t       var_vec     = vdupq_n_f32(0.0);
-    float32x4_t       gamma_vec   = vdupq_n_f32(1.0);
-    float32x4_t       beta_vec    = vdupq_n_f32(0.0);
-    float32x4_t       denominator = vdupq_n_f32(0.0);
-    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(slice != id.z())
+        // Perform core calculations using vector operations
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Conctruct vectors
-            mean_vec = vdupq_n_f32(*(input_mean + id.z()));
-            var_vec  = vdupq_n_f32(*(input_var + id.z()));
-            if(input_gamma != nullptr)
-            {
-                gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
-            }
-            if(input_beta != nullptr)
-            {
-                beta_vec = vdupq_n_f32(*(input_beta + id.z()));
-            }
+            const auto mean_vec  = wrapper::vloadq(input_mean + x);
+            const auto var_vec   = wrapper::vloadq(input_var + x);
+            const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+            const auto beta_vec  = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
 
             // Calculate denominator
-            denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-            slice       = id.z();
+            const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+            // Calculate x bar
+            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+            const auto x_bar     = wrapper::vmul(numerator, denominator);
+            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            wrapper::vstore(output_ptr + x, res);
         }
 
-        // Calculate x bar
-        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
-        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
-        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
-
-        // Perform fused activation
-        if(fused_activation)
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
         {
-            activation_functor(res);
+            // Construct scalars
+            const T gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+            const T beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+            const T denominator = sqrt(input_var[x] + _epsilon);
+            const T numerator   = input_ptr[x] - input_mean[x];
+            const T x_bar       = numerator / denominator;
+            T       res         = beta + x_bar * gamma;
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            *reinterpret_cast<T *>(output_ptr + x) = res;
         }
-
-        // Store results
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
-    },
-    input, output);
-}
-
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    F activation_functor(_act_info);
-
-    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Conctruct vectors
-        const float32x4_t mean_vec  = vld1q_f32(input_mean + id.x());
-        const float32x4_t var_vec   = vld1q_f32(input_var + id.x());
-        const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
-        const float32x4_t beta_vec  = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
-        // Calculate denominator
-        const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-
-        // Calculate x bar
-        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
-        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
-        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
-
-        // Perform fused activation
-        if(fused_activation)
-        {
-            activation_functor(res);
-        }
-
-        // Store results
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
     },
     input, output);
 }
@@ -340,13 +299,13 @@
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false, ::detail::dummy<float16_t, 8>> :
-                    &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false, ::detail::dummy<float16_t, 8>>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, false, detail::dummy<float16_t, 8>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
-                    &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, false, detail::dummy<float, 4>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
             break;
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
@@ -359,31 +318,31 @@
     // NCHW Fused Batched Normalization with activation functions : FP32
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
     };
     // NHWC Fused Batched Normalization with activation functions : FP32
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::lubrelu<float, 4>> }
     };
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     // NCHW Fused Batched Normalization with activation functions : FP16
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::relu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::brelu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::lubrelu<float16_t, 8>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
     };
     // NHWC Fused Batched Normalization with activation functions : FP16
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::relu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::brelu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::lubrelu<float16_t, 8>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::lubrelu<float16_t, 8>> }
     };
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
@@ -475,3 +434,4 @@
 
     (this->*_func)(window);
 }
+} // namespace arm_compute