COMPMID-1632 Add CLL2NormalizationLayer for NHWC and FP32

Change-Id: Iae22554d5fe893fd22a000eab5bfd8275ea06eb3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154102
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index c1be447..d76e12a 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -189,7 +189,12 @@
 
     for(unsigned int y = 0; y < HEIGHT; ++y)
     {
-        res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // SQRSUM
+        res += in;
     }
 
 #if defined(MEAN)
@@ -236,7 +241,12 @@
 
     for(unsigned int z = 0; z < DEPTH; ++z)
     {
-        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // SQRSUM
+        res += in;
     }
 
 #if defined(MEAN)
@@ -288,7 +298,12 @@
 
     for(unsigned int w = 0; w < BATCH; ++w)
     {
-        res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // SQRSUM
+        res += in;
     }
 
 #if defined(MEAN)