APPBROWSER-314: Performance optimization for BatchNormalizationLayer

Change-Id: Ie3ad9abb64e90720609bb6e67662eaf9dd4f3689
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111826
Reviewed-by: Joel Liang <joel.liang@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
(cherry picked from commit 02c1fa663926cc4fcd1995d4d18d7528e0c85d94)
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111834
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index c3df5d5..be1d01f 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -127,12 +127,12 @@
 }
 
 #elif defined(DATA_TYPE_FP16)
-BUFFER_DECLARATION(src, 1, uint, );
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-BUFFER_DECLARATION(mean, 3, uint, );
-BUFFER_DECLARATION(var, 4, uint, );
-BUFFER_DECLARATION(beta, 5, uint, );
-BUFFER_DECLARATION(gamma, 6, uint, );
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(mean, 3, uvec2, readonly);
+BUFFER_DECLARATION(var, 4, uvec2, readonly);
+BUFFER_DECLARATION(beta, 5, uvec2, readonly);
+BUFFER_DECLARATION(gamma, 6, uvec2, readonly);
 
 /** Apply batch normalization.
  *
@@ -180,43 +180,86 @@
     Vector   beta  = CONVERT_TO_VECTOR_STRUCT_FP16(beta);
     Vector   gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma);
 
-    vec2  input_value;
+    uvec2 packed_s[5];
+    vec4  unpacked_s[5];
     float denominator;
     float numerator;
-    vec2  x_bar;
     float gamma_param;
     float beta_param;
+    vec4  x_bar;
+    vec4  result;
 
     uint current_slice = gl_GlobalInvocationID.z;
-    if((current_slice % uint(2)) == uint(0))
+    packed_s[0]        = src_ptr[src.current_offset >> 3];
+    packed_s[1]        = var_ptr[(var.current_offset + current_slice * var.stride_x) >> 3];
+    packed_s[2]        = mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 3];
+    packed_s[3]        = gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 3];
+    packed_s[4]        = beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 3];
+    unpacked_s[0]      = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    unpacked_s[1]      = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    unpacked_s[2]      = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+    unpacked_s[3]      = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+    unpacked_s[4]      = vec4(unpackHalf2x16(packed_s[4].x), unpackHalf2x16(packed_s[4].y));
+
+    if((current_slice % uint(4)) == uint(0))
     {
-        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
-        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).x;
+        denominator = unpacked_s[1].x;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
         //Calculate x bar and store results
-        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).x;
-        x_bar     = MUL_OP(SUB_OP(input_value, numerator), denominator);
+        numerator = unpacked_s[2].x;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
-        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).x;
-        beta_param  = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).x;
+        gamma_param = unpacked_s[3].x;
+        beta_param  = unpacked_s[4].x;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
 
-        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+    }
+    else if((current_slice % uint(4)) == uint(1))
+    {
+        denominator = unpacked_s[1].y;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].y;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].y;
+        beta_param  = unpacked_s[4].y;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+    }
+    else if((current_slice % uint(4)) == uint(2))
+    {
+        denominator = unpacked_s[1].z;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].z;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].z;
+        beta_param  = unpacked_s[4].z;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
     }
     else
     {
-        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
-        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).y;
+        denominator = unpacked_s[1].w;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
         //Calculate x bar and store results
-        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).y;
-        x_bar     = MUL_OP(SUB_OP(input_value, numerator), denominator);
+        numerator = unpacked_s[2].w;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
-        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).y;
-        beta_param  = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).y;
+        gamma_param = unpacked_s[3].w;
+        beta_param  = unpacked_s[4].w;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
 
-        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
     }
 }
-#endif /*DATA_TYPE_FP32*/
+#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index 982143f..dee2a55 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -64,7 +64,11 @@
     _gamma   = gamma;
     _epsilon = epsilon;
 
-    const unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+    unsigned int num_elems_processed_per_iteration = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+    }
 
     // Set build options
     std::set<std::string> build_opts;
@@ -83,10 +87,10 @@
 
     AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 1, mean->info()->dimension(1));
-    AccessWindowStatic     var_access(var->info(), 0, 0, var->info()->dimension(0) + 1, var->info()->dimension(1));
-    AccessWindowStatic     beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 1, beta->info()->dimension(1));
-    AccessWindowStatic     gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 1, gamma->info()->dimension(1));
+    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
+    AccessWindowStatic     var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
+    AccessWindowStatic     beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
+    AccessWindowStatic     gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
 
     update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
     output_access.set_valid_region(win, input->info()->valid_region());
diff --git a/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
new file mode 100755
index 0000000..4464ea2
--- /dev/null
+++ b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/benchmark/fixtures/BatchNormalizationLayerFixture.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h"
+#include "tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+} // namespace
+
+using GCBatchNormalizationLayerFixture = BatchNormalizationLayerFixture<GCTensor, GCBatchNormalizationLayer, GCAccessor>;
+
+TEST_SUITE(GC)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", 1)));
+
+TEST_SUITE(NIGHTLY)
+REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY,
+                                framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(),
+                                                                                        data_types),
+                                                            framework::dataset::make("Batches", { 4, 8 })));
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h b/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
index 79dbc76..55411a4 100644
--- a/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
@@ -29,6 +29,12 @@
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Fixture.h"
+#ifdef ARM_COMPUTE_GC
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "tests/GLES_COMPUTE/Helper.h"
+
+using namespace arm_compute::test::gles_compute;
+#endif /* ARM_COMPUTE_GC */
 
 namespace arm_compute
 {
@@ -76,6 +82,12 @@
     void run()
     {
         batch_norm_layer.run();
+#ifdef ARM_COMPUTE_GC
+        if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
+        {
+            force_sync_tensor(dst);
+        }
+#endif /* ARM_COMPUTE_GC */
     }
 
     void teardown()