COMPMID-556: Fix CLNormalization issues.

- Moves the slice-offset calculations out of the CL kernel core loop.
- Changes the number of elements accessed for CROSS_MAP to reduce the
  redundant padding that is applied.

Change-Id: If41c3adddd977be9386fe34940d055c301ccbb91
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95917
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 4e65560..f870589 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -93,13 +93,13 @@
 
     const int current_slice = get_global_id(2);
 
-    const int left_slice  = max(current_slice - (int)RADIUS, (int)0);
-    const int right_slice = min(current_slice + (int)RADIUS, (int)(NUM_SLICES - 1));
+    const int left_slice  = max(-(int)RADIUS, -current_slice);
+    const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
 
     for(int i = left_slice; i <= right_slice; i++)
     {
         VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i - current_slice));
+        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
         acc    = ADD_OP(acc, MUL_OP(values, values));
     }
 
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index a744739..6481ad0 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -73,7 +73,7 @@
     _border_size                    = BorderSize(0, border_width);
 
     const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
-    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const unsigned int num_elems_read_per_iteration      = _is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2)) : num_elems_processed_per_iteration;
 
     // Set build options
     std::set<std::string> build_opts;