COMPMID-2980: Fix (Nightly) armv7a build failures

Change-Id: I8c2a20fc345694d1ad6e0fe63e4f22fb73e6c1df
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2463
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/CPP/Validate.h b/arm_compute/core/CPP/Validate.h
index 1ec41a9..f195a31 100644
--- a/arm_compute/core/CPP/Validate.h
+++ b/arm_compute/core/CPP/Validate.h
@@ -37,15 +37,15 @@
  *
  * @return Status
  */
-inline arm_compute::Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                                         const ITensorInfo *tensor_info)
+inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
+                                            const ITensorInfo *tensor_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16,
                                         function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    return arm_compute::Status {};
+    return Status {};
 }
 
 /** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in.
@@ -57,12 +57,12 @@
  *
  * @return Status
  */
-inline arm_compute::Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                                         const ITensor *tensor)
+inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
+                                            const ITensor *tensor)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
-    return arm_compute::Status{};
+    return Status{};
 }
 
 #define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index ef3adc4..37c1f1b 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -31,31 +31,21 @@
 using namespace neon_convolution_kernels;
 using namespace qasymm8;
 
-template <typename T, typename U = int32_t>
-inline T saturating_doubling_high_mul(const T&, const U&);
-
-template <>
 inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
 {
   return vqrdmulhq_s32(a, b);
 }
 
-template <>
 inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
 {
   return vqrdmulhq_n_s32(a, b);
 }
 
-template <>
 inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
 {
   return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
 }
 
-template <typename T, typename U = int32_t>
-inline T rounding_divide_by_exp2(const T& x, const U exponent);
-
-template <>
 inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
 {
   const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
@@ -63,7 +53,6 @@
   return vrshlq_s32(fixed, shift);
 }
 
-template <>
 inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
 {
   const int32x4_t shift = vdupq_n_s32(-exponent);
@@ -72,7 +61,6 @@
   return vrshlq_s32(fixed, shift);
 }
 
-template <>
 inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
 {
   const int32x2_t shift = vdup_n_s32(-exponent);
@@ -81,7 +69,6 @@
   return vrshl_s32(fixed, shift);
 }
 
-template <>
 inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
 {
   const int32x2_t xs = vdup_n_s32(x);
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 8c2ab92..52b1f65 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -103,6 +103,15 @@
                 break;
         }
     }
+    /** Initialize the union with a S8 pixel value
+     *
+     * @param[in] v S8 value.
+     */
+    PixelValue(int8_t v)
+        : PixelValue()
+    {
+        value.s8 = v;
+    }
     /** Initialize the union with a U8 pixel value
      *
      * @param[in] v U8 value.
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index c281312..901d080 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1876,11 +1876,11 @@
 struct GEMMLowpOutputStageInfo
 {
     GEMMLowpOutputStageType type{ GEMMLowpOutputStageType::NONE }; /**< GEMMLowp output stage type */
-    int                     gemmlowp_offset{ 0 };                  /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
-    int                     gemmlowp_multiplier{ 0 };              /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
-    int                     gemmlowp_shift{ 0 };                   /**< GEMMLowp output stage shift used for quantizing to uint8 */
-    int                     gemmlowp_min_bound{ 0 };               /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
-    int                     gemmlowp_max_bound{ 0 };               /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
+    int32_t                 gemmlowp_offset{ 0 };                  /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
+    int32_t                 gemmlowp_multiplier{ 0 };              /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+    int32_t                 gemmlowp_shift{ 0 };                   /**< GEMMLowp output stage shift used for quantizing to uint8 */
+    int32_t                 gemmlowp_min_bound{ 0 };               /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
+    int32_t                 gemmlowp_max_bound{ 0 };               /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
     std::vector<int32_t>    gemmlowp_multipliers{};                /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
     std::vector<int32_t>    gemmlowp_shifts{};                     /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
     bool                    is_quantized_per_channel{ false };     /**< GEMMLowp quantized per-channel flag */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index c11fffe..18c5471 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -557,15 +557,15 @@
  */
 inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
 {
-    PixelValue min(0);
-    PixelValue max(0);
+    PixelValue min{};
+    PixelValue max{};
     switch(dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
         {
-            min = PixelValue(std::numeric_limits<uint8_t>::lowest());
-            max = PixelValue(std::numeric_limits<uint8_t>::max());
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()));
             break;
         }
         case DataType::S8:
@@ -573,22 +573,22 @@
         case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
         {
-            min = PixelValue(std::numeric_limits<int8_t>::lowest());
-            max = PixelValue(std::numeric_limits<int8_t>::max());
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::max()));
             break;
         }
         case DataType::U16:
         case DataType::QASYMM16:
         {
-            min = PixelValue(std::numeric_limits<uint16_t>::lowest());
-            max = PixelValue(std::numeric_limits<uint16_t>::max());
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
             break;
         }
         case DataType::S16:
         case DataType::QSYMM16:
         {
-            min = PixelValue(std::numeric_limits<int16_t>::lowest());
-            max = PixelValue(std::numeric_limits<int16_t>::max());
+            min = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::lowest()));
+            max = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
             break;
         }
         case DataType::U32:
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index f0b0770..1bdc995 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -40,7 +40,7 @@
  *
  * @return a status
  */
-Status calculate_quantized_multiplier(float multiplier, int *quant_multiplier, int *shift);
+Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift);
 /** Calculate quantized representation of multiplier with value less than one.
  *
  * @param[in]  multiplier       Real multiplier.
@@ -49,7 +49,7 @@
  *
  * @return a status
  */
-Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant_multiplier, int *right_shift);
+Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift);
 /** Calculate quantized representation of multiplier having value greater than one.
  *
  * @param[in]  multiplier           Real multiplier.
@@ -58,7 +58,7 @@
  *
  * @return a status
  */
-Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift);
+Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
 
 /** Calculate quantized representation of per-channel multipliers with value less than one.
  *