COMPMID-2708 NEDepthwiseConvolution Generic: support for QUANT8_PER_CHANNEL_SYMM

COMPMID-2470 Implement a new and generic depthwise convolution for NEON QASYMM8 NHWC
COMPMID-2477 Enable FP16 data type for the new generic convolution on NEON for NHWC
COMPMID-2625 Remove old implementation files for the generic NEDepthwiseConvolution

Change-Id: I8f6deda4fc69dd7e472fba3228b1ed5dad172f3e
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2094
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index 7458f81..608093d 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -188,17 +188,17 @@
     {
         for(int z = 0; z < input_depth; ++z)
         {
-            int         output_multiplier = 0;
-            int         output_shift      = 0;
-            const float weights_scale     = (is_quantized_per_channel) ? weights_scale_vec[z] : weights_scale_vec[0];
-            const float multiplier        = input_scale * weights_scale / output_scale;
-            arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
             for(unsigned int m = 0; m < depth_multiplier; ++m)
             {
                 const int     out_z    = z * depth_multiplier + m;
                 const int32_t bias_val = *static_cast<const int32_t *>(biases(Coordinates(out_z)));
 
+                int         output_multiplier = 0;
+                int         output_shift      = 0;
+                const float weights_scale     = (is_quantized_per_channel) ? weights_scale_vec[out_z] : weights_scale_vec[0];
+                const float multiplier        = input_scale * weights_scale / output_scale;
+                arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
                 for(int y = minimum_y; y <= minimum_y + maximum_y; y += conv_info.stride().second)
                 {
                     for(int x = minimum_x; x <= minimum_x + maximum_x; x += conv_info.stride().first)