COMPMID-421: Added FP16 suppot to NENormalizationLayer and NEPixelWiseMultiplication.

Change-Id: If174f8071502fc5cc94b27cd44a9b1d5e451a9e2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79553
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 0183e54..76ace91 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,12 +46,10 @@
 
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output);
@@ -68,27 +66,79 @@
     _norm_info     = norm_info;
     _border_size   = BorderSize(0, border_width);
 
-    const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+    unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
-    switch(norm_info.type())
+    switch(_input->info()->data_type())
     {
-        case NormType::IN_MAP_1D:
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+        case DataType::F32:
+        {
+            num_elems_processed_per_iteration = 4;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
-        case NormType::IN_MAP_2D:
-            // Normalize over X and Y
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+        }
+        case DataType::F16:
+        {
+            num_elems_processed_per_iteration = 8;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
-        case NormType::CROSS_MAP:
-            _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+        }
+        case DataType::QS8:
+        {
+            num_elems_processed_per_iteration = 16;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
             break;
+        }
         default:
             ARM_COMPUTE_ERROR("NOT SUPPORTED!");
     }
 
-    const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
-    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-    const unsigned int num_rows                          = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+    const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const unsigned int num_rows                     = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
 
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
@@ -104,8 +154,8 @@
     INEKernel::configure(win);
 }
 
-template <unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize(const Window &window)
+template <DataType dt, unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_float(const Window &window)
 {
     Iterator input(_input, window);
     Iterator input_squared(_input_squared, window);
@@ -121,39 +171,83 @@
     const int min_top    = 0;
     const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
-    const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
-    const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
-    const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    if(dt == DataType::F32)
     {
-        // Get range to normalize
-        const int current_row   = do_2D_norm ? id[dim_y] : 0;
-        const int current_slice = id[dim];
-        const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-        const int first_slice   = std::max(current_slice - radius, min_left);
-        const int last_slice    = std::min(current_slice + radius, max_right);
+        const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+        const float32x4_t beta_vec  = vdupq_n_f32(_norm_info.beta());
+        const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
 
-        // Accumulate 2D In-Map values
-        float32x4_t accu = vdupq_n_f32(0.f);
-        for(int j = first_row; j <= last_row; j++)
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            // Compute row displacement
-            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-            for(int i = first_slice; i <= last_slice; ++i)
-            {
-                accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
-            }
-        }
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
 
-        // Normalize
-        const float32x4_t normalized       = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
-        const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
-    },
-    input, input_squared, output);
+            // Accumulate 2D In-Map values
+            float32x4_t accu = vdupq_n_f32(0.f);
+            for(int j = first_row; j <= last_row; j++)
+            {
+                // Compute row displacement
+                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+                }
+            }
+
+            // Normalize
+            const float32x4_t normalized       = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+            const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+            vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+#ifdef ARM_COMPUTE_ENABLE_FP16
+    else if(dt == DataType::F16)
+    {
+        const float16x8_t coeff_vec    = vdupq_n_f16(_norm_info.scale_coeff());
+        const float16x8_t beta_vec_f16 = vdupq_n_f16(_norm_info.beta());
+        const float16x8_t kappa_vec    = vdupq_n_f16(_norm_info.kappa());
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
+
+            // Accumulate 2D In-Map values
+            float16x8_t accu = vdupq_n_f16(0.f);
+            for(int j = first_row; j <= last_row; j++)
+            {
+                // Compute row displacement
+                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vaddq_f16(accu, vld1q_f16(reinterpret_cast<const float16_t *>(input_squared_ptr + i * input_squared_stride)));
+                }
+            }
+
+            const float16x8_t norm_f16         = vpowq_f16(vaddq_f16(kappa_vec, vmulq_f16(coeff_vec, accu)), beta_vec_f16);
+            const float16x8_t normalized_pixel = vmulq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), vinvq_f16(norm_f16));
+            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
 }
 
 template <unsigned int dim, bool do_2D_norm>