COMPMID-3705: Remove OpenCL padding: CLBatchNormalizationLayerKernel

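Remove the execution window padding requirement from the NHWC path of
the OpenCL batch normalization kernel:

- Clamp the vector size with adjust_vec_size() and pass the remainder
  of the first dimension to the kernel as VEC_SIZE_LEFTOVER.
- Replace the Tensor3D/Vector accessors with raw byte offsets and
  store the result through STORE_VECTOR_SELECT, which issues a partial
  store for the leftover elements instead of writing into padding.
- Keep the padded-window logic for the NCHW path only
  (validate_and_configure_window_nchw).
- Add a ValidateZeroPadding test checking that no tensor involved in
  the function gets padded.

As an illustrative sketch of the leftover handling (the numbers are
hypothetical and given in elements rather than bytes): for a row of 9
elements with VEC_SIZE = 4, VEC_SIZE_LEFTOVER = 1 and three work items
cover the row as

    gid0 == 0: x_offs = max(0*4 - 3, 0) = 0 -> partial store, element 0
    gid0 == 1: x_offs = 1*4 - 3 = 1         -> full store, elements 1..4
    gid0 == 2: x_offs = 2*4 - 3 = 5         -> full store, elements 5..8
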
Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: If077a245156be69f34834cbfbd0a36e570ee4149
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4107
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 16dbeaf..89cbe44 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -129,7 +129,7 @@
     res = MUL_OP(gamma_vec, x_bar);
 #else  /* USE_DEFAULT_GAMMA */
     // gamma is equal to 1, no need to perform multiplications
-    res          = x_bar;
+    res                         = x_bar;
 #endif /* USE_DEFAULT_GAMMA */
 
 #ifndef USE_DEFAULT_BETA
@@ -198,19 +198,21 @@
 #endif /* USE_DEFAULT_GAMMA */
                                             float epsilon)
 {
-    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
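+    // Byte offset along the channel axis: every work item after the first is shifted back by
+    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements so the last full vector ends exactly
+    // at the tensor boundary, while work item 0 is clamped to 0 and performs the partial store
+    // when VEC_SIZE_LEFTOVER != 0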
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
 #ifdef IN_PLACE
-    Tensor3D out = in;
+    __global uchar *output_addr = input_addr;
 #else  /* IN_PLACE */
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
 #endif /* IN_PLACE */
-    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
 #ifndef USE_DEFAULT_BETA
-    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
 #endif /* USE_DEFAULT_BETA */
 #ifndef USE_DEFAULT_GAMMA
-    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
 #endif /* USE_DEFAULT_GAMMA */
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
@@ -222,40 +224,37 @@
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     x_bar = 0;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res = 0;
+    res0 = 0;
 
-    const int current_slice = get_global_id(0);
-
-    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
     denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
 
     // Calculate x bar and store results
-    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
     numerator = SUB_OP(data, numerator);
     x_bar     = MUL_OP(numerator, denominator);
 
 #ifndef USE_DEFAULT_GAMMA
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
 
-    res = MUL_OP(gamma_vec, x_bar);
+    res0 = MUL_OP(gamma_vec, x_bar);
 #else  /* USE_DEFAULT_GAMMA */
     // gamma is equal to 1, no need to perform multiplications
-    res = x_bar;
+    res0 = x_bar;
 #endif /* USE_DEFAULT_GAMMA */
 
 #ifndef USE_DEFAULT_BETA
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
     // beta is not zero, hence we need to perform the addition
-    res = ADD_OP(res, beta_vec);
+    res0 = ADD_OP(res0, beta_vec);
 #endif /* USE_DEFAULT_BETA */
 
-    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
 
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global DATA_TYPE *)out.ptr);
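+    // Work item 0 issues a partial store of VEC_SIZE_LEFTOVER elements when the first dimension
+    // is not a multiple of VEC_SIZE; all other work items store a full vector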
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 }
 #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
 
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index a2cabcf..1c1df6c 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -80,16 +80,9 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
-                                                        ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
 {
-    if(output != nullptr)
-    {
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output, *input->clone());
-    }
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
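+    // Clamp the vector size to the input width; the NCHW path keeps its padded window below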
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
 
     // Configure kernel window
     Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -107,25 +100,6 @@
         window_changed = update_window_and_padding(win, input_access);
     }
 
-    // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
-
-        if(beta != nullptr)
-        {
-            AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, beta_access);
-        }
-        if(gamma != nullptr)
-        {
-            AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
-            window_changed = window_changed || update_window_and_padding(win, gamma_access);
-        }
-    }
-
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
@@ -162,12 +136,13 @@
                                                   mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
                                                   (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
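+    // Clamp the vector size to the first dimension of the input; the remainder is handled in
+    // the kernel through VEC_SIZE_LEFTOVER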
+    unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -191,13 +166,24 @@
     }
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
+    if(output != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), *input->info()->clone());
+    }
+
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
-                                                    mean->info(), var->info(),
-                                                    (beta != nullptr) ? beta->info() : nullptr,
-                                                    (gamma != nullptr) ? gamma->info() : nullptr);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    if(input->info()->data_layout() == DataLayout::NHWC)
+    {
+        Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+        ICLKernel::configure_internal(win);
+    }
+    else
+    {
+        auto win_config = validate_and_configure_window_nchw(input->info(), (_run_in_place) ? nullptr : output->info());
+        ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+        ICLKernel::configure_internal(win_config.second);
+    }
 
     _config_id = "batch_normalization_layer_";
     _config_id += string_from_data_type(input->info()->data_type());
@@ -218,11 +204,12 @@
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
-                                                              mean->clone().get(), var->clone().get(),
-                                                              (beta != nullptr) ? beta->clone().get() : nullptr,
-                                                              (gamma != nullptr) ? gamma->clone().get() : nullptr)
-                                .first);
+
+    if(input->data_layout() != DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+                                    .first);
+    }
 
     return Status{};
 }
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index 3d1848b..e67f4cc 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -62,6 +62,29 @@
                                                            framework::dataset::make("UseBeta", { false, true })),
                                                    framework::dataset::make("UseGamma", { false, true })),
                                            framework::dataset::make("Epsilon", { 0.001f }));
+
+bool validate_zero_padding(TensorShape shape0, const TensorShape shape1, float epsilon, ActivationLayerInfo act_info, DataType dt, DataLayout data_layout)
+{
+    if(data_layout == DataLayout::NHWC)
+    {
+        permute(shape0, PermutationVector(2U, 0U, 1U));
+    }
+
+    // Create tensors
+    CLTensor src   = create_tensor<CLTensor>(shape0, dt, 1, QuantizationInfo(), data_layout);
+    CLTensor dst   = create_tensor<CLTensor>(shape0, dt, 1, QuantizationInfo(), data_layout);
+    CLTensor mean  = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor var   = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor beta  = create_tensor<CLTensor>(shape1, dt, 1);
+    CLTensor gamma = create_tensor<CLTensor>(shape1, dt, 1);
+
+    // Create and configure function
+    CLBatchNormalizationLayer norm;
+    norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon, act_info);
+
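+    // Zero padding holds only if every tensor used by the function reports empty padding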
+    return src.info()->padding().empty() && dst.info()->padding().empty() && mean.info()->padding().empty() && var.info()->padding().empty() && beta.info()->padding().empty()
+           && gamma.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
@@ -118,6 +141,16 @@
 // clang-format on
 // *INDENT-ON*
 
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
+                                                                                         act_infos),
+                                                                                 framework::dataset::make("DataType", { DataType::F32, DataType::F16 })),
+                                                                         framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+               shape0, shape1, epsilon, act_info, data_type, data_layout)
+{
+    bool status = validate_zero_padding(shape0, shape1, epsilon, act_info, data_type, data_layout);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(Random, CLBatchNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),