COMPMID-3477: Remove padding from NEPixelWiseMultiplicationKernel

Remove padding from all NEPixelWiseMultiplicationKernel functions.
Add test case for U8_U8_S16(input1,input2,output).
Add reference code for U8_U8_S16(input1,input2,output).
Remove window shrink test from NormalizationLayer.

Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: I28d89790c5527a42f918814a0ee3d6ec4c273532
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3468
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index f72d156..e4f7207 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -68,24 +68,21 @@
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Even normalization
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non implemented IN_MAP_2D
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("NormInfo",  { NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 4),
                                             NormalizationLayerInfo(NormType::IN_MAP_2D, 5),
-                                            NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::CROSS_MAP, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, true, true })),
     input_info, output_info, norm_info, expected)
 {
     bool is_valid = bool(NENormalizationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), norm_info));
@@ -110,9 +107,7 @@
     NENormalizationLayer norm;
     norm.configure(&src, &dst, info);
 
-    // To enable check on src as soon as NEPixelWiseMultiplicationKernel stops using padding anymore: COMPMID-3477
-    //validate(src.info()->padding(), PaddingSize(0,0,0,0));
-    validate(dst.info()->padding(), PaddingSize());
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
 }
 
 template <typename T>
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 9c0417b..29eaf0c 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -83,16 +83,17 @@
 
 // *INDENT-OFF*
 // clang-format off
-#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, INPLACE_DATASET, VALIDATE) \
-    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                   \
-                           combine(combine(combine(combine(combine(combine(                                                       \
+#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, DT3, SCALE, RP, INPLACE_DATASET, VALIDATE) \
+    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                        \
+                           combine(combine(combine(combine(combine(combine(combine(                                            \
                            datasets::SHAPES,                                                                              \
                            framework::dataset::make("DataType1", DataType::DT1)),                                         \
                            framework::dataset::make("DataType2", DataType::DT2)),                                         \
+                           framework::dataset::make("DataType3", DataType::DT3)),                                         \
                            framework::dataset::make("Scale", std::move(SCALE))),                                          \
                            datasets::ConvertPolicies()),                                                                  \
-                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)), \
-                           (INPLACE_DATASET)))                               \
+                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)),                               \
+                           (INPLACE_DATASET)))                                                                            \
     {                                                                                                                     \
         VALIDATE                                                                                                          \
     }
@@ -115,6 +116,7 @@
 using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
 template <typename T>
 using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationU8U8ToS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t, int16_t>;
 
 TEST_SUITE(NEON)
 TEST_SUITE(PixelWiseMultiplication)
@@ -193,7 +195,7 @@
                                                    ConvertPolicy::WRAP,
                                         })),
 
-               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, true , false, false, true, false })),
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false })),
                input1_info, input2_info, output_info, scale, policy, expected)
 {
     bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO));
@@ -376,18 +378,48 @@
 TEST_SUITE_END() // QSYMM16toS32
 TEST_SUITE_END() // Quantized
 
+TEST_SUITE(U8U8toS16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       datasets::ConvertPolicies()),
+                                                                                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                                                                                                       framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<int16_t>(1), 0.f);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall1, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                        framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                        framework::dataset::make("Scale", { scale_other })),
+                                                                                                                        datasets::ConvertPolicies()),
+                                                                                                                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_ZERO)),
+                                                                                                                        framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // U8U8toS16
+
 TEST_SUITE(U8toU8)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(uint8_t, 1))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(uint8_t, 1))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toU8
@@ -395,16 +427,18 @@
 TEST_SUITE(U8toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
                                                  WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }), DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }),
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, scale_other, TO_ZERO, framework::dataset::make("InPlace", { false }), DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, framework::dataset::make("InPlace", { false }),
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toS16
@@ -412,15 +446,15 @@
 TEST_SUITE(S16toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(int16_t, 2))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // S16toS16
@@ -429,7 +463,7 @@
 TEST_SUITE(F16toF16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, ALL, SmallShapes(), F16, F16, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, ALL, SmallShapes(), F16, F16, F16, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE_END() // F16toF16
@@ -438,21 +472,21 @@
 TEST_SUITE(F32toF32)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // F32toF32
 
 TEST_SUITE(Broadcast)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, ALL, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, ALL, SmallShapesBroadcast(), F32, F32, F32, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
                                                  VALIDATE(float, 1.f))
 TEST_SUITE_END() // Broadcast
 
diff --git a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
index 315c840..3869c35 100644
--- a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
+++ b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
@@ -139,27 +139,28 @@
     bool             _is_inplace{ false };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2, typename T3 = T2>
+class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, bool is_inplace)
+    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, bool is_inplace)
     {
-        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, shape, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape, shape, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
+                                                                                                                   QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class PixelWiseMultiplicationBroadcastValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2, typename T3 = T2>
+class PixelWiseMultiplicationBroadcastValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, bool is_inplace)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+               bool is_inplace)
     {
-        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape0, shape1, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
+                                                                                                                   QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 3e21fca..0ee8dee 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -178,6 +178,34 @@
 }
 
 template <>
+SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+                                                DataType dt_out, const QuantizationInfo &qout)
+{
+    SimpleTensor<int16_t> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dt_out, 1, qout);
+
+    if(src1.data_type() == DataType::QASYMM8 && src2.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
+        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
+        SimpleTensor<float> dst_tmp  = pixel_wise_multiplication<float, float, float>(src1_tmp, src2_tmp, scale, convert_policy, rounding_policy, DataType::F32, qout);
+        dst                          = convert_to_symmetric<int16_t>(dst_tmp, qout);
+    }
+    else
+    {
+        if(scale < 0)
+        {
+            ARM_COMPUTE_ERROR("Scale of pixel-wise multiplication must be non-negative");
+        }
+
+        Coordinates id_src1{};
+        Coordinates id_src2{};
+        Coordinates id_dst{};
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, scale, convert_policy, rounding_policy, id_src1, id_src2, id_dst);
+    }
+    return dst;
+}
+
+template <>
 SimpleTensor<int8_t> pixel_wise_multiplication(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                                                DataType dt_out, const QuantizationInfo &qout)
 {
@@ -234,6 +262,7 @@
 }
 // *INDENT-OFF*
 // clang-format off
+template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<int32_t> pixel_wise_multiplication(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);
 template SimpleTensor<float> pixel_wise_multiplication(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, DataType dt_out, const QuantizationInfo &qout);