COMPMID-684: 2D In-Map normalization support for CL

Change-Id: I73a11ef3ff7265abce196b128413f54623d33cae
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111294
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 7e18ce5..51cd28e 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -51,7 +51,7 @@
     /** Set the input and output tensors.
      *
      * @param[in, out] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler)
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32 (Written to by the border handler)
      * @param[out]     output    Destination tensor. Dimensions, data type and number of channels must match the input ones.
      * @param[in]      norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      */
@@ -59,7 +59,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
      *
      * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                      and an optional 4th dimension for batch of inputs. Data types supported: F16/F32
+     *                      and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32
      * @param[in] output    Destination tensor. Dimensions, data type and number of channels must match the input ones.
      * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
      *
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 948fe44..4bc4a48 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -278,7 +278,7 @@
     { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
-    { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+    { "normalization_layer_in_map", "normalization_layer.cl" },
     { "batchnormalization_layer", "batchnormalization_layer.cl" },
     { "NV12_to_IYUV_bt709", "color_convert.cl" },
     { "NV12_to_RGB888_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index f870589..bc00252 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -50,7 +50,7 @@
 
 #endif // FIXED_POINT_POSITION
 
-/** Apply cross map normalization.
+/** Apply cross-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -92,9 +92,8 @@
     kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
 
     const int current_slice = get_global_id(2);
-
-    const int left_slice  = max(-(int)RADIUS, -current_slice);
-    const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+    const int left_slice    = max(-(int)RADIUS, -current_slice);
+    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
 
     for(int i = left_slice; i <= right_slice; i++)
     {
@@ -112,7 +111,7 @@
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
 
-/** Apply in map normalization.
+/** Apply in-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -137,8 +136,8 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
-                                            TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
+                                         TENSOR3D_DECLARATION(output))
 {
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -152,17 +151,34 @@
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
 
-    const int current_pos = get_global_id(0) << 2;
+    const int current_col = get_global_id(0) << 2;
+    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
+    const int right_pos   = min((int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1 - current_col));
 
-    const int left_pos  = max(current_pos - (int)RADIUS, -3);
-    const int right_pos = min(current_pos + (int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1));
+#if defined(IN_MAP_2D)
+    const int current_row = get_global_id(1);
+    const int first_row   = max(-(int)RADIUS, -current_row);
+    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
 
-    for(int i = left_pos; i <= right_pos; i += 1)
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
     {
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i - current_pos, 0, 0));
-        acc    = ADD_OP(acc, MUL_OP(values, values));
+#endif /* defined(IN_MAP_2D) */
+        for(int i = left_pos; i <= right_pos; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
     }
+#endif /* defined(IN_MAP_2D) */
 
     acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index d94f7b2..c46b598 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -43,7 +43,6 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
 
     if(is_data_type_fixed_point(input->data_type()))
     {
@@ -63,8 +62,11 @@
     return Error{};
 }
 
-std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, bool is_in_map, unsigned int norm_size)
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
+    const unsigned int norm_size = norm_info.norm_size();
+    bool               is_in_map = norm_info.is_in_map();
+
     const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
     const BorderSize   border_size  = BorderSize(0, border_width);
 
@@ -73,6 +75,7 @@
 
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
+    // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
     AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
@@ -113,27 +116,27 @@
     _border_size                    = BorderSize(0, border_width);
 
     const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
+    const bool         is_in_map_2D                      = (norm_info.type() == NormType::IN_MAP_2D);
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
-    build_opts.emplace(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
-    build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
-    build_opts.emplace(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.emplace(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
-    build_opts.emplace(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+    build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+    build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
+    build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+    build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
 
     // Create kernel
-    std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    std::string kernel_name = _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), _is_in_map, norm_info.norm_size());
+    auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -153,7 +156,7 @@
 Error CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info.is_in_map(), norm_info.norm_size()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
 
     return Error{};
 }
diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp
index 2e6ff89..0ed5284 100644
--- a/tests/validation/CL/NormalizationLayer.cpp
+++ b/tests/validation/CL/NormalizationLayer.cpp
@@ -49,14 +49,19 @@
 
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<int8_t>  tolerance_qs8(2);
-constexpr AbsoluteTolerance<int16_t> tolerance_qs16(3);
+constexpr AbsoluteTolerance<int16_t> tolerance_qs16(4);
 
 /** Input data set. */
-const auto NormalizationDataset = combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::CROSS_MAP })),
+const auto NormalizationDataset = combine(combine(combine(combine(datasets::SmallShapes(), datasets::NormalizationTypes()),
                                                           framework::dataset::make("NormalizationSize", 3, 9, 2)),
                                                   framework::dataset::make("Beta", { 0.5f, 1.f, 2.f })),
                                           framework::dataset::make("IsScaled", { true }));
-const auto NormalizationDatasetFP32 = combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::CROSS_MAP })),
+const auto NormalizationDatasetFP16 = combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("NormType", { NormType::IN_MAP_1D, NormType::CROSS_MAP })),
+                                                              framework::dataset::make("NormalizationSize", 3, 9, 2)),
+                                                      framework::dataset::make("Beta", { 0.5f, 1.f, 2.f })),
+                                              framework::dataset::make("IsScaled", { true }));
+
+const auto NormalizationDatasetFP32 = combine(combine(combine(combine(datasets::SmallShapes(), datasets::NormalizationTypes()),
                                                               framework::dataset::make("NormalizationSize", 3, 9, 2)),
                                                       framework::dataset::make("Beta", { 0.5f, 1.f, 2.f })),
                                               framework::dataset::make("IsScaled", { true, false }));
@@ -107,12 +112,12 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDatasetFP16, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDatasetFP16, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);