COMPMID-2336: Fix build issues.

Change-Id: I0932dc9ca4649f0825950ed9d6d249212bc6971e
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1671
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index fb38bdc..f7edf8e 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -116,7 +116,7 @@
             _scores_above_thd_vector.emplace_back(score_i);
             // Initialize respective index and visited
             _sorted_indices.emplace_back(num_above_thd);
-            _visited.emplace_back(false);
+            _visited.push_back(false);
             ++num_above_thd;
         }
     }
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index aafdb2e..c9d4e9b 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -28,6 +28,8 @@
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
+#include "support/ToolchainSupport.h"
+
 namespace arm_compute
 {
 namespace
@@ -160,7 +162,7 @@
                 for(size_t m = 0; m < depth_multiplier; ++m)
                 {
                     const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
-                    acc.at(m)              = std::fma(weights_val, input_val, acc.at(m));
+                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
                 }
 
                 offs += dilation.x() * input_stride_y;
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index f638f0b..e8f44b6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -373,12 +373,23 @@
           final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
         }
 
+#ifndef __aarch64__
+        const int16x8x2_t zelems = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
+                                             vreinterpretq_s16_s32(final_accs[1]));
+        const int8x16_t elems = vreinterpretq_s8_s16(zelems.val[0]);
+
+        const int8x16x2_t zoutput = vuzpq_s8(elems, elems);
+        const uint8x8_t output =
+                vget_low_u8(vreinterpretq_u8_s8(zoutput.val[0]));
+        vst1_u8(get_output_ptr(oi, oj, channel), output);
+#else
         const int8x16_t elems = vreinterpretq_s8_s16(
             vuzp1q_s16(vreinterpretq_s16_s32(final_accs[0]),
                        vreinterpretq_s16_s32(final_accs[1])));
         const uint8x8_t output =
             vget_low_u8(vreinterpretq_u8_s8(vuzp1q_s8(elems, elems)));
         vst1_u8(get_output_ptr(oi, oj, channel), output);
+#endif // __aarch64__
       }
     }
   }
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index 020a4a1..03bbff9 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -195,6 +195,23 @@
     return ::copysign(x, y);
 }
 
+/** Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
+ *
+ * @note This function implements the same behaviour as std::fma except that it doesn't support
+ *       integral types. std::fma is not in the namespace std in some Android toolchains.
+ *
+ * @param[in] x floating-point value
+ * @param[in] y floating-point value
+ * @param[in] z floating-point value
+ *
+ * @return Result floating point value equal to (x*y) + z.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T fma(T x, T y, T z)
+{
+    return ::fma(x, y, z);
+}
+
 /** Loads the data from the given location, converts them to character string equivalents
  *  and writes the result to a character string buffer.
  *
@@ -304,6 +321,23 @@
     return std::copysign(x, y);
 }
 
+/** Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
+ *
+ * @note This function implements the same behaviour as std::fma except that it doesn't support
+ *       integral types. std::fma is not in the namespace std in some Android toolchains.
+ *
+ * @param[in] x floating-point value
+ * @param[in] y floating-point value
+ * @param[in] z floating-point value
+ *
+ * @return Result floating point value equal to (x*y) + z.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T fma(T x, T y, T z)
+{
+    return std::fma(x, y, z);
+}
+
 /** Loads the data from the given location, converts them to character string equivalents
  *  and writes the result to a character string buffer.
  *