COMPMID-447: Support scaling factors different than 1 for QS8/QS16 NEPixelWiseMultiplication.

Change-Id: I6d90a18df861d53546bdca982192b4ffc0dbb3c2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80794
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 150db39..33663eb 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -131,60 +131,100 @@
 template <bool is_scale255, bool is_sat>
 void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
 {
-    // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
-    ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
-    ARM_COMPUTE_UNUSED(n);
-
-    const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
     const auto output = static_cast<qint8_t *__restrict>(output_ptr);
 
-    const qint8x16_t ta1 = vld1q_qs8(input1);
-    const qint8x16_t ta2 = vld1q_qs8(input2);
+    const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
+    const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
 
-    qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+    if(is_scale255)
+    {
+        qint16x8_t       tmp1_high = vmovl_s8(vget_high_s8(ta1));
+        qint16x8_t       tmp1_low  = vmovl_s8(vget_low_s8(ta1));
+        const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
+        const qint16x8_t tmp2_low  = vmovl_s8(vget_low_s8(ta2));
 
-    vst1q_s8(output, res);
+        const float32x4x2_t scale255_f32 =
+        {
+            {
+                scale255_constant_f32q,
+                scale255_constant_f32q
+            }
+        };
+        const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+
+        tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
+        tmp1_low  = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
+        tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
+        tmp1_low  = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
+
+        if(is_sat)
+        {
+            vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
+        }
+        else
+        {
+            vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
+        }
+    }
+    else
+    {
+        const qint8x16_t vn  = vdupq_n_s8(-n);
+        qint8x16_t       res = ta2;
+
+        if(is_sat)
+        {
+            res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
+        }
+        else
+        {
+            res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
+        }
+        vst1q_qs8(output, res);
+    }
 }
 
 template <bool is_scale255, bool is_sat>
 void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
 {
-    // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
-    ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 16-bit fixed-point pixel-wise multiplication");
-    ARM_COMPUTE_UNUSED(n);
-
     const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
-    const qint16x8x2_t ta2 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
+    qint16x8x2_t       res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
 
-    if(is_sat)
+    if(is_scale255)
     {
-        const qint16x8x2_t res =
+        const float32x4x2_t scale255_f32 =
         {
             {
-                // First 8 elements
-                vqmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position),
-                // Second 8 elements
-                vqmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position)
+                scale255_constant_f32q,
+                scale255_constant_f32q
             }
         };
-
-        vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
+        const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
+        if(is_sat)
+        {
+            res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+            res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+        }
+        else
+        {
+            res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
+            res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
+        }
     }
     else
     {
-        const qint16x8x2_t res =
+        const qint16x8_t vn = vdupq_n_s16(-n);
+        if(is_sat)
         {
-            {
-                // First 8 elements
-                vmulq_qs16(ta1.val[0], ta2.val[0], fixed_point_position),
-                // Second 8 elements
-                vmulq_qs16(ta1.val[1], ta2.val[1], fixed_point_position)
-            }
-        };
-
-        vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
+            res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+            res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+        }
+        else
+        {
+            res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
+            res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
+        }
     }
+    vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
 }
 
 template <bool is_scale255, bool is_sat>
@@ -438,6 +478,8 @@
     {
         // Check that all data types are the same and all fixed-point positions are the same
         ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+        // Check if scale is representable in fixed-point with the provided settings
+        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
     }
 
     _input1         = input1;
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index c46ad6a..f809448 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -442,6 +442,36 @@
     validate(Accessor(dst), ref_dst);
 }
 
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmallScale255, SmallShapes() * DataType::QS8 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP * boost::unit_test::data::xrange(1, 7),
+                     shape, dt, scale, convert_policy, rounding_policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmallScaleOther, SmallShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7),
+                     shape, dt, convert_policy, rounding_policy, fixed_point_position)
+{
+    const float scale = 1.f / static_cast<float>(1 << fixed_point_position);
+
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst, 1.f);
+}
+
 BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
 BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange<int>(1, 7),
                      shape, dt, convert_policy, rounding_policy, fixed_point_position)
@@ -455,6 +485,36 @@
     // Validate output
     validate(Accessor(dst), ref_dst);
 }
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLargeScale255, LargeShapes() * DataType::QS8 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7),
+                     shape, dt, scale, convert_policy, rounding_policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLargeScaleOther, LargeShapes() * DataType::QS8 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 7),
+                     shape, dt, convert_policy, rounding_policy, fixed_point_position)
+{
+    const float scale = 1.f / static_cast<float>(1 << fixed_point_position);
+
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst, 1.f);
+}
 BOOST_AUTO_TEST_SUITE_END()
 
 BOOST_AUTO_TEST_SUITE(QS16)
@@ -472,6 +532,36 @@
     validate(Accessor(dst), ref_dst);
 }
 
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmallScale255, SmallShapes() * DataType::QS16 * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP * boost::unit_test::data::xrange(1, 15),
+                     shape, dt, scale, convert_policy, rounding_policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmallScaleOther, SmallShapes() * DataType::QS16 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange(1, 15),
+                     shape, dt, convert_policy, rounding_policy, fixed_point_position)
+{
+    const float scale = 1.f / static_cast<float>(1 << fixed_point_position);
+
+    // Compute function
+    Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, scale, convert_policy, rounding_policy, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, scale, fixed_point_position, convert_policy, rounding_policy);
+
+    // Validate output
+    validate(Accessor(dst), ref_dst, 1.f);
+}
+
 BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
 BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * DataType::QS16 *ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange<int>(1, 15),
                      shape, dt, convert_policy, rounding_policy, fixed_point_position)
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 4d067ac..3190478 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -866,7 +866,7 @@
 
 // Fixed-point Pixel-wise Multiplication
 template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
-void fixed_point_pixel_wise_multiplication(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out, int scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+void fixed_point_pixel_wise_multiplication(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
 {
     using namespace fixed_point_arithmetic;
 
@@ -881,18 +881,20 @@
     ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS8) && (fixed_point_position == 0 || fixed_point_position > 7));
     ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS16) && (fixed_point_position == 0 || fixed_point_position > 15));
 
-    fixed_point<T> fp_scale(scale, fixed_point_position);
-    const bool     is_sat     = convert_policy == ConvertPolicy::SATURATE;
-    const bool     do_scaling = scale != 1;
+    const fixed_point<T> fp_scale(scale, fixed_point_position);
+    const bool           is_sat = convert_policy == ConvertPolicy::SATURATE;
 
     for(int i = 0; i < in1.num_elements(); ++i)
     {
-        fixed_point<T> val1(in1[i], fixed_point_position, true);
-        fixed_point<T> val2(in2[i], fixed_point_position, true);
-        fixed_point<T> res = (is_sat) ? val1 * val2 : mul<OverflowPolicy::WRAP>(val1, val2);
-        if(do_scaling)
+        const fixed_point<T> val1(in1[i], fixed_point_position, true);
+        fixed_point<T>       res(in2[i], fixed_point_position, true);
+        if(is_sat)
         {
-            res = (is_sat) ? res * fp_scale : mul<OverflowPolicy::WRAP>(res, fp_scale);
+            res = mul(mul(res, val1), fp_scale);
+        }
+        else
+        {
+            res = mul<OverflowPolicy::WRAP>(mul<OverflowPolicy::WRAP>(res, val1), fp_scale);
         }
         out[i] = res.raw();
     }