COMPMID-3236: Add support for QSYMM16 inputs with S32 output in CLPixelWiseMultiplicationKernel

Change-Id: Ifc519f53f04fcb14ddb9c17f98cc687f34285c97
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3018
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
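
A minimal usage sketch of the new path through the public CLPixelWiseMultiplication
function, which wraps this kernel. The tensor shapes, quantization scales and helper
function name below are illustrative assumptions, not part of this patch:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"

    using namespace arm_compute;

    // Hypothetical example: multiply two QSYMM16 tensors and accumulate into S32.
    void qsymm16_mul_to_s32_example()
    {
        CLScheduler::get().default_init();

        // Two QSYMM16 inputs with example scales (assumed values for illustration).
        CLTensor in1, in2, out;
        in1.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f)));
        in2.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f)));
        // S32 output is only accepted when both inputs are QSYMM16 (see the new validate check).
        out.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::S32));

        CLPixelWiseMultiplication mul;
        mul.configure(&in1, &in2, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();

        // Fill in1/in2 with quantized data, then run the multiplication on the CL device.
        mul.run();
    }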
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index ff5afa3..2df3ff4 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -28,17 +28,9 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
 #include "support/StringSupport.h"
 
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
 namespace arm_compute
 {
 namespace
@@ -77,7 +69,7 @@
                                                              1,
                                                              DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                              DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                             DataType::F32);
+                                                             DataType::S32, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                         "Output can only be U8 if both inputs are U8");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
@@ -86,6 +78,8 @@
                                         "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                         "Output can only be QSYMM16 if both inputs are QSYMM16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
+                                        "Output can only be S32 if both inputs are QSYMM16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
     }
 
@@ -177,22 +171,24 @@
         scale_int = std::abs(exponent - 1);
     }
 
-    std::string compute_type;
+    std::string acc_type;
     // Check if it has float inputs and output
     if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
     {
-        scale_int    = -1;
-        compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
+        scale_int = -1;
+        acc_type  = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
     }
     else
     {
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+        if(input1->info()->element_size() == 2 || input2->info()->element_size() == 2)
         {
-            compute_type = "int";
+            // Use 32-bit accumulator for 16-bit input
+            acc_type = "int";
         }
         else
         {
-            compute_type = "ushort";
+            // Use 16-bit accumulator for 8-bit input
+            acc_type = "ushort";
         }
     }
 
@@ -205,7 +201,7 @@
     build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_quantized)
+    if(is_quantized && (output->info()->data_type() != DataType::S32))
     {
         const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
@@ -227,7 +223,7 @@
         kernel_name += (scale_int >= 0) ? "_int" : "_float";
         build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
         build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
-        build_opts.add_option("-DDATA_TYPE_RES=" + compute_type);
+        build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
         if(act_info.enabled())
         {
             build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));