COMPMID-2409: Add QSYMM16 support for PixelWiseMultiplication for NEON

Change-Id: Idfd3b45857201d5143242f9517d3353150b2c923
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1422
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 172aaef..2247c14 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -69,19 +69,20 @@
 {
 #ifdef __aarch64__
     return vrndnq_f32(val);
-#else // __aarch64__
+#else  // __aarch64__
     static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
-    static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f);
-    static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
-    const float32x4_t floor_val = vfloorq_f32(val);
-    const float32x4_t diff = vsubq_f32(val, floor_val);
+    static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
+    static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
+    const float32x4_t        floor_val        = vfloorq_f32(val);
+    const float32x4_t        diff             = vsubq_f32(val, floor_val);
 
     /*
     * Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0).
     * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
     */
 
-    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))), floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+    return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+                     floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
 #endif // __aarch64__
 }
 
@@ -191,6 +192,21 @@
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) // Divides each 32-bit lane of x by 2^exponent, rounding to nearest with exact halves going away from zero.
+{
+    const int32x4_t shift_vec  = vdupq_n_s32(-exponent); // Every lane holds -exponent; vrshlq_s32 treats a negative count as a right shift.
+    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31); // Sign bit of (x & shift_vec) smeared over the lane: -1 where x < 0 (for exponent > 0), otherwise 0.
+    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); // Saturating add nudges negative lanes down by one so the rounding shift sends exact halves away from zero.
+    return vrshlq_s32(fixed_up_x, shift_vec); // Rounding shift by -exponent, i.e. a rounding right shift by exponent.
+}
+
+inline int32_t rounding_divide_by_pow2(int32_t x, int exponent) // Scalar overload: divides x by 2^exponent, rounding to nearest with exact halves going away from zero.
+{
+    const int32_t mask      = (1 << exponent) - 1; // Low 'exponent' bits set. NOTE(review): 1 << exponent overflows int for exponent >= 31; presumably callers stay within [0, 30] - confirm.
+    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0); // One below half the divisor, raised by one for negative x so an exact half is not bumped toward zero.
+    return (x >> exponent) + ((x & mask) > threshold ? 1 : 0); // Shifted quotient, plus one when the remainder exceeds the threshold. NOTE(review): relies on >> of a negative x being an arithmetic shift.
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Exponent polynomial coefficients */
 /** Logarithm polynomial coefficients */