Fix CLNormalizationLayer NCHW border calculation

* Calculate border using both norm size and vec_size_x
* Expose reference tensor printer

Resolves: COMPMID-4793

Change-Id: I7bd8e49779baf7d6848271757bc7993aa1ed2960
Signed-off-by: SiCongLi <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6201
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
index 0fef98e..deada49 100644
--- a/src/core/CL/cl_kernels/nchw/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -134,9 +134,8 @@
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     kappa_v = SQCVT_SAT(KAPPA);
 
-    const int current_col = get_global_id(0) << 2;
-    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
-    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+    const int left_pos  = -(int)RADIUS;
+    const int right_pos = (int)RADIUS;
 
 #if defined(IN_MAP_2D)
     const int current_row = get_global_id(1);
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index a5dfafe..2765300 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -71,18 +71,29 @@
     const DataLayout data_layout = input->data_layout();
     if(data_layout == DataLayout::NCHW)
     {
-        const unsigned int vec_size_x            = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
-        const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
-        const bool         is_norm_accross_width = norm_idx == 0;
+        const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        const unsigned int norm_idx             = get_normalization_dimension_index(input->data_layout(), norm_info);
+        const bool         is_norm_across_width = norm_idx == 0;
 
-        const unsigned int border_width = is_norm_accross_width ? vec_size_x - 1 : 0;
-        const BorderSize   border_size  = BorderSize(0, border_width);
+        const unsigned int norm_radius = norm_info.norm_size() / 2;
+        // Border / padding calculation:
+        // For NCHW no border handling is impelmeneted in the kernel in the x axis.
+        // This means the x axis is fully-padded depending on vec_size_x and norm_size
+        // E.G. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is element 'p' is padding):
+        // In : |p|#|#|#|p|p|
+        // Out:   |#|#|#|p|
+        // The output has 1 right padding because of the vec_size_x.
+        // The input has 1 left padding because radius = 1.
+        // The input has 2 right padding because of radius = 1 AND because of the extra output padding
+        const unsigned int border_width_left  = is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+        const BorderSize   border_size        = BorderSize(0, border_width_right, 0, border_width_left);
 
         win = calculate_max_window(*input, Steps(vec_size_x));
 
         // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
         // Reads can occur within the valid region of the input
-        if(is_norm_accross_width)
+        if(is_norm_across_width)
         {
             AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
             window_changed = window_changed || update_window_and_padding(win, input_access);
@@ -150,10 +161,21 @@
 
     if(data_layout == DataLayout::NCHW)
     {
-        const unsigned int norm_idx     = get_normalization_dimension_index(data_layout, norm_info);
-        _is_norm_across_width           = norm_idx == 0;
-        const unsigned int border_width = _is_norm_across_width ? vec_size_x - 1 : 0;
-        _border_size                    = BorderSize(0, border_width);
+        const unsigned int norm_idx    = get_normalization_dimension_index(data_layout, norm_info);
+        _is_norm_across_width          = norm_idx == 0;
+        const unsigned int norm_radius = norm_info.norm_size() / 2;
+        // Border / padding calculation:
+        // For NCHW no border handling is impelmeneted in the kernel in the x axis.
+        // This means the x axis is fully-padded depending on vec_size_x and norm_size
+        // E.G. for input x dimension = 3, norm_size = 3 (radius = 1), vec_size_x = 2 ('#' is element 'p' is padding):
+        // In : |p|#|#|#|p|p|
+        // Out:   |#|#|#|p|
+        // The output has 1 right padding because of the vec_size_x.
+        // The input has 1 left padding because radius = 1.
+        // The input has 2 right padding because of radius = 1 AND the extra output padding
+        const unsigned int border_width_left  = _is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+        _border_size                          = BorderSize(0, border_width_right, 0, border_width_left);
     }
 
     const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
diff --git a/tests/SimpleTensorPrinter.h b/tests/SimpleTensorPrinter.h
index 6c1506b..5d0299a 100644
--- a/tests/SimpleTensorPrinter.h
+++ b/tests/SimpleTensorPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,6 @@
 {
 namespace test
 {
-namespace
-{
 template <typename T>
 inline std::string prettify_tensor(const SimpleTensor<T> &input, const IOFormatInfo &io_fmt = IOFormatInfo{ IOFormatInfo::PrintRegion::NoPadding })
 {
@@ -152,6 +150,5 @@
     }
 }
 #endif // PRINT_TENSOR_LIMIT
-}
 } // namespace test
 } // namespace arm_compute