COMPMID-417: DepthConvert NEON for QS8/QS16.

Change-Id: Ieb120bccf146045b3a0001ceb3893d4e67fd19df
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79763
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Steven Niu <steven.niu@arm.com>
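
In short: the scalar and NEON float -> fixed point converters are renamed to sqcvt_*/vqcvt_* and now round to nearest (half away from zero) and saturate on overflow instead of truncating, and NEDepthConvert gains QS16 -> F32 and F32 -> QS16 support. A minimal standalone sketch of the scalar scheme (the helper and clamp below are illustrative, not library code):

#include <algorithm>
#include <cstdint>

// Standalone illustration of the behaviour of sqcvt_qs8_f32 (not part of the patch)
int8_t to_qs8_example(float a, int fixed_point_position)
{
    // round_nearest_integer(a * 2^fixed_point_position), rounding half away from zero
    float val = a * (1 << fixed_point_position) + ((a >= 0) ? 0.5f : -0.5f);

    // Saturate to the QS8 range instead of letting the cast overflow
    val = std::min(std::max(val, -128.0f), 127.0f);

    return static_cast<int8_t>(val);
}

// to_qs8_example(1.5f, 7)   == 127  (1.5 is not representable with 7 fractional bits, so it saturates)
// to_qs8_example(-0.25f, 7) == -32  (-0.25 * 128 = -32)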
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index da304c6..5eb4c55 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -296,7 +296,7 @@
  *
  * @return The result of the conversion float -> 8 bit fixed point
  */
-qint8_t scvt_qs8_f32(float a, int fixed_point_position);
+qint8_t sqcvt_qs8_f32(float a, int fixed_point_position);
 
 /** Convert a 16 bit fixed point to float
  *
@@ -314,7 +314,7 @@
  *
  * @return The result of the conversion float -> 16 bit fixed point
  */
-qint16_t scvt_qs16_f32(float a, int fixed_point_position);
+qint16_t sqcvt_qs16_f32(float a, int fixed_point_position);
 
 /** Scalar saturating move and narrow.
  *
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index fab91d6..fdbc3f0 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -366,10 +366,10 @@
     return static_cast<float>(a) / (1 << fixed_point_position);
 }
 
-inline qint8_t scvt_qs8_f32(float a, int fixed_point_position)
+inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position)
 {
     // round_nearest_integer(a * 2^(fixed_point_position))
-    return static_cast<qint8_t>(a * (1 << fixed_point_position) + 0.5f);
+    return saturate_convert<float, qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
 }
 
 inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
@@ -377,10 +377,10 @@
     return static_cast<float>(a) / (1 << fixed_point_position);
 }
 
-inline qint16_t scvt_qs16_f32(float a, int fixed_point_position)
+inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position)
 {
     // round_nearest_integer(a * 2^(fixed_point_position))
-    return static_cast<qint16_t>(a * (1 << fixed_point_position) + 0.5f);
+    return saturate_convert<float, qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
 }
 
 inline qint8_t sqmovn_qs16(qint16_t a)
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 660464e..e3eb5d4 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -788,36 +788,36 @@
  * @param[in] a                    Float input vector
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
- * @return The result of the conversion float -> 8 bit fixed point
+ * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
  */
-qint8x8_t vcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
+qint8x8_t vqcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
 
 /** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements
  *
  * @param[in] a                    Float input vector
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
- * @return The result of the conversion float -> 16 bit fixed point
+ * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
  */
-qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
+qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
 
 /** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
  *
  * @param[in] a                    Float input vector
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
- * @return The result of the conversion float -> 8 bit fixed point
+ * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
  */
-qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
 
 /** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements
  *
  * @param[in] a                    Float input vector
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
- * @return The result of the conversion float -> 16 bit fixed point
+ * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
  */
-qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
+qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
 
 /** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
  *
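
For reference, a rough usage sketch of the renamed vector conversion; the wrapper function and buffers are made up, only vqcvtq_qs8_f32 and the NEON load/store intrinsics come from the library:

#include "arm_compute/core/NEON/NEFixedPoint.h"

using namespace arm_compute;

// Convert a block of 16 floats to QS8 with 7 fractional bits (illustrative helper, not library code)
void convert_block_f32_to_qs8(const float *src, qint8_t *dst)
{
    const int fixed_point_position = 7;

    const float32x4x4_t in =
    {
        {
            vld1q_f32(src),
            vld1q_f32(src + 4),
            vld1q_f32(src + 8),
            vld1q_f32(src + 12)
        }
    };

    // Out-of-range values saturate to -128/127 rather than wrapping
    vst1q_s8(dst, vqcvtq_qs8_f32(in, fixed_point_position));
}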
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 4f7f44a..92af82c 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -236,7 +236,7 @@
             vdupq_n_f32(a),
         }
     };
-    return vcvtq_qs8_f32(res, fixed_point_position);
+    return vqcvtq_qs8_f32(res, fixed_point_position);
 }
 
 inline qint16x8_t vdupq_n_qs16(qint16_t a)
@@ -809,15 +809,15 @@
     return vqaddq_s32(a, tmp);
 }
 
-inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
 
     float32x4x2_t res_f32 =
     {
         {
-            vdupq_n_f32(0.5f),
-            vdupq_n_f32(0.5f)
+            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
         }
     };
 
@@ -837,11 +837,11 @@
     return vqmovn_s16(res_s16);
 }
 
-inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
+inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
 
-    float32x4_t res_f32 = vdupq_n_f32(0.5f);
+    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
 
     res_f32 = vmlaq_f32(res_f32, a, pow2);
 
@@ -850,17 +850,17 @@
     return vqmovn_s32(res_s32);
 }
 
-inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
 
     float32x4x4_t res_f32 =
     {
         {
-            vdupq_n_f32(0.5f),
-            vdupq_n_f32(0.5f),
-            vdupq_n_f32(0.5f),
-            vdupq_n_f32(0.5f)
+            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
         }
     };
 
@@ -890,15 +890,15 @@
     return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
 }
 
-inline qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
+inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
 
     float32x4x2_t res_f32 =
     {
         {
-            vdupq_n_f32(0.5f),
-            vdupq_n_f32(0.5f)
+            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
         }
     };
 
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
index 7c59ce4..47b3a7e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -52,11 +52,12 @@
      *    U8 -> U16, S16, S32
      *    U16 -> U8, U32
      *    S16 -> U8, S32
-     *    F32 -> QS8
+     *    QS16 -> F32
+     *    F32 -> QS8, QS16
      *
      *
-     * @param[in]  input  The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
-     * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
+     * @param[in]  input  The input tensor to convert. Data type supported: QS8/U8/U16/S16/QS16/F32.
+     * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/QS16/U32/S32/F32.
      * @param[in]  policy Conversion policy.
      * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
      *                    It is not used on fixed point conversion.
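
A rough usage sketch of the new F32 -> QS16 path through the function; shape, fixed point position and the surrounding setup are illustrative, and the TensorInfo constructor taking a fixed point position is assumed:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_f32_to_qs16_example()
{
    const TensorShape shape(16U, 16U);
    const int         fixed_point_position = 4;

    Tensor src;
    Tensor dst;
    src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    dst.allocator()->init(TensorInfo(shape, 1, DataType::QS16, fixed_point_position));

    NEDepthConvert convert;
    // shift must be 0: it is not used on fixed point conversions
    convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src ...
    convert.run();
}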
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index d1cdd7d..1499df0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -56,8 +56,8 @@
     if(is_data_type_fixed_point(input->info()->data_type()))
     {
         ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ?
-                                      scvt_qs8_f32(beta, input->info()->fixed_point_position()) :
-                                      scvt_qs16_f32(beta, input->info()->fixed_point_position()))
+                                      sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) :
+                                      sqcvt_qs16_f32(beta, input->info()->fixed_point_position()))
                      << " ";
         ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position();
     }
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 2d6b83a..c65b9e0 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -77,8 +77,8 @@
     if(is_data_type_fixed_point(input0->info()->data_type()))
     {
         mm_arguments << "-DALPHA=" << (input0->info()->data_type() == DataType::QS8 ?
-                                       scvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
-                                       scvt_qs16_f32(alpha, input0->info()->fixed_point_position()))
+                                       sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
+                                       sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position()))
                      << " ";
         mm_arguments << "-DFIXED_POINT_POSITION=" << input0->info()->fixed_point_position() << " ";
     }
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 1bd0353..492d197 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -267,9 +267,9 @@
     int      fixed_point_position = _input->info()->fixed_point_position();
 
     static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
-    const qint8x16_t        CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
-    const qint8x16_t        a       = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
-    const qint8x16_t        b       = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+    const qint8x16_t        CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
+    const qint8x16_t        a       = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
+    const qint8x16_t        b       = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index e6f233c..d0aec69 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -58,7 +58,7 @@
     qint8x16_t       gamma_vec   = vdupq_n_qs8(0);
     qint8x16_t       beta_vec    = vdupq_n_qs8(0);
     qint8x16_t       denominator = vdupq_n_qs8(0);
-    const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+    const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
     execute_window_loop(window, [&](const Coordinates & id)
     {
         if(slice != id.z())
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index 56612a7..3c1a94d 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -46,27 +46,35 @@
 
 void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_ERROR_ON(shift >= 8);
     ARM_COMPUTE_ERROR_ON(input == output);
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
-                             "Only data_types supported [in] QS8 ->  [out] F32");
-
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
                                                                             && output->info()->data_type() != DataType::S32),
                              "Only data_types supported [in] U8 -> [out] U16, S16, S32");
 
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+                             "Only data_types supported [in] QS8 ->  [out] F32");
+
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
                              "Only data_types supported [in] U16 ->  [out] U8, U32");
 
     ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
                              "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
-                             "Only data_types supported [in] F32 ->  [out] QS8");
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && output->info()->data_type() != DataType::F32,
+                             "Only data_types supported [in] QS16 ->  [out] F32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+                             "Only data_types supported [in] F32 ->  [out] QS8, QS16");
+
+    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     _policy = policy;
     _shift  = shift;
@@ -346,6 +354,38 @@
             }
             break;
         }
+        case DataType::QS16:
+        {
+            const int fixed_point_position = _input->info()->fixed_point_position();
+
+            switch(_output->info()->data_type())
+            {
+                case DataType::F32:
+                {
+                    /* Up-conversion QS16 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+                                vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels.val[0]), fixed_point_position));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels.val[0]), fixed_point_position));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels.val[1]), fixed_point_position));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels.val[1]), fixed_point_position));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        }
         case DataType::F32:
         {
             switch(_output->info()->data_type())
@@ -366,13 +406,40 @@
                             }
                         };
 
-                        const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+                        const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, fixed_point_position);
 
                         vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
                     },
                     input, output);
                     break;
                 }
+                case DataType::QS16:
+                {
+                    const int fixed_point_position = _output->info()->fixed_point_position();
+                    /* Down-conversion F32 -> QS16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x2_t texels_f32_1 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+                            }
+                        };
+                        const float32x4x2_t texels_f32_2 =
+                        {
+                            {
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+                                vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+                            }
+                        };
+
+                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, fixed_point_position));
+                        vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, fixed_point_position));
+                    },
+                    input, output);
+                    break;
+                }
                 default:
                     ARM_COMPUTE_ERROR("Output data type not supported");
             }
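
For clarity, a plain scalar version of what the new QS16 -> F32 loop computes per element; the helper name is made up, and the kernel does the same work with vcvt_f32_qs16 on four int16x4_t halves per iteration:

#include <cstdint>

void convert_block_qs16_to_f32(const int16_t *in, float *out, int fixed_point_position)
{
    const float scale = 1.0f / (1 << fixed_point_position);

    // One kernel iteration handles 16 elements; each one is simply raw / 2^fixed_point_position
    for(int i = 0; i < 16; ++i)
    {
        out[i] = static_cast<float>(in[i]) * scale;
    }
}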
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 91fbe6f..f2cd18d 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -94,7 +94,7 @@
 void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
 {
     const int        fixed_point_position = input->info()->fixed_point_position();
-    const qint8x16_t beta_qs8             = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+    const qint8x16_t beta_qs8             = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
 
     Iterator in(input, window);
     Iterator out(output, window);
@@ -118,7 +118,7 @@
 void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
 {
     const int        fixed_point_position = input->info()->fixed_point_position();
-    const qint16x8_t beta_qs16            = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position));
+    const qint16x8_t beta_qs16            = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
 
     Iterator in(input, window);
     Iterator out(output, window);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index b81be6c..8381dd8 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -456,7 +456,7 @@
         // Multiply by the weight of the matrix product (alpha)
         if(multiply_alpha)
         {
-            const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+            const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
             acc00_qs8                 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
             acc01_qs8                 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
             acc02_qs8                 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
@@ -585,7 +585,7 @@
         // Multiply by the weight of the matrix product (alpha)
         if(multiply_alpha)
         {
-            const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+            const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
             acc00_qs16                  = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
             acc01_qs16                  = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
             acc02_qs16                  = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
@@ -1058,7 +1058,7 @@
     const size_t    out_stride3          = out_stride1 * 3;
     const int       num_elems_matrix_b_x = input1->info()->dimension(0);
     const int       fixed_point_position = input0->info()->fixed_point_position();
-    const qint8x8_t alpha_qs8            = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+    const qint8x8_t alpha_qs8            = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
     ARM_COMPUTE_UNUSED(alpha_qs8);
 
     // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
@@ -1291,7 +1291,7 @@
     const size_t     out_stride3          = out_stride1 * 3;
     const int        num_elems_matrix_b_x = input1->info()->dimension(0);
     const int        fixed_point_position = input0->info()->fixed_point_position();
-    const qint16x4_t alpha_qs16           = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+    const qint16x4_t alpha_qs16           = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
     ARM_COMPUTE_UNUSED(alpha_qs16);
 
     // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 5bb8b1c..e4de60d 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -136,11 +136,11 @@
     {
         if(std::is_same<T, qint8_t>::value)
         {
-            *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+            *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
         }
         else if(std::is_same<T, qint16_t>::value)
         {
-            *out_ptr = scvt_qs16_f32(1.0f, fixed_point_position);
+            *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
         }
         else
         {
@@ -255,11 +255,11 @@
         {
             if(std::is_same<T, qint8_t>::value)
             {
-                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
             }
             else if(std::is_same<T, qint16_t>::value)
             {
-                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
+                *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
             }
             else
             {
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index 011e366..24b5149 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -23,9 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "arm_compute/core/Validate.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
@@ -34,11 +32,6 @@
 
 void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input == output);
-    ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
-
     auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
diff --git a/tests/TensorLibrary.h b/tests/TensorLibrary.h
index 5b2c5b6..3fb593c 100644
--- a/tests/TensorLibrary.h
+++ b/tests/TensorLibrary.h
@@ -560,6 +560,7 @@
             break;
         }
         case DataType::S16:
+        case DataType::QS16:
         {
             ARM_COMPUTE_ERROR_ON(!(std::is_same<int16_t, D>::value));
             std::uniform_int_distribution<int16_t> distribution_s16(low, high);
diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h
index 53f532c..261fcd6 100644
--- a/tests/validation/FixedPoint.h
+++ b/tests/validation/FixedPoint.h
@@ -333,7 +333,7 @@
      */
     static constexpr T to_fixed(float val, uint8_t p)
     {
-        return static_cast<T>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5));
+        return static_cast<T>(saturate_cast<float>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5)));
     }
     /** Clamp value between two ranges
      *
diff --git a/tests/validation/NEON/DepthConvert.cpp b/tests/validation/NEON/DepthConvert.cpp
index 4a37d98..65d3ab1 100644
--- a/tests/validation/NEON/DepthConvert.cpp
+++ b/tests/validation/NEON/DepthConvert.cpp
@@ -208,6 +208,89 @@
 }
 BOOST_AUTO_TEST_SUITE_END()
 
+BOOST_AUTO_TEST_SUITE(QS16_to_F32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute configure and validate region/padding
+    compute_configure_validate(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32_to_QS16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute configure and validate region/padding
+    compute_configure_validate(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+                     * boost::unit_test::data::xrange(1, 15, 1),
+                     shape, policy, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
 BOOST_AUTO_TEST_SUITE(U8_to_U16)
 BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
 
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 0502f53..e90635f 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -518,94 +518,58 @@
 }
 
 // Depth conversion
-template <typename T1, typename T2>
+template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 >
 void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR("The conversion is not supported");
-}
+    using namespace fixed_point_arithmetic;
 
-template <>
-void depth_convert<int8_t, float>(const Tensor<int8_t> &in, Tensor<float> &out, ConvertPolicy policy, uint32_t shift)
-{
-    const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
+    const int fixed_point_position = in.fixed_point_position();
     for(int i = 0; i < in.num_elements(); ++i)
     {
-        out[i] = static_cast<float>(in[i]) * (1.0f / (1 << fixed_point_position));
+        out[i] = static_cast<float>(fixed_point<T1>(in[i], fixed_point_position, true));
     }
 }
 
-template <>
-void depth_convert<float, int8_t>(const Tensor<float> &in, Tensor<int8_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
 {
-    const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
+    using namespace fixed_point_arithmetic;
+
+    const int fixed_point_position = out.fixed_point_position();
     for(int i = 0; i < in.num_elements(); ++i)
     {
-        float val = in[i] * (1 << fixed_point_position) + 0.5f;
-        out[i]    = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<int8_t>(val) : static_cast<int8_t>(val));
+        out[i] = fixed_point<T2>(in[i], fixed_point_position).raw();
     }
 }
 
-template <>
-void depth_convert<uint8_t, uint16_t>(const Tensor<uint8_t> &in, Tensor<uint16_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
 {
-    for(int i = 0; i < in.num_elements(); ++i)
+    // Up-casting
+    if(std::numeric_limits<T1>::digits <= std::numeric_limits<T2>::digits)
     {
-        out[i] = static_cast<uint16_t>(in[i]) << shift;
+        for(int i = 0; i < in.num_elements(); ++i)
+        {
+            out[i] = static_cast<T2>(in[i]) << shift;
+        }
+    }
+    // Down-casting
+    else
+    {
+        for(int i = 0; i < in.num_elements(); ++i)
+        {
+            T1 val = in[i] >> shift;
+            out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<T2>(val) : static_cast<T2>(val));
+        }
     }
 }
 
-template <>
-void depth_convert<uint8_t, int16_t>(const Tensor<uint8_t> &in, Tensor<int16_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
 {
     for(int i = 0; i < in.num_elements(); ++i)
     {
-        out[i] = static_cast<int16_t>(in[i]) << shift;
-    }
-}
-
-template <>
-void depth_convert<uint8_t, int32_t>(const Tensor<uint8_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
-{
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        out[i] = static_cast<int32_t>(in[i]) << shift;
-    }
-}
-
-template <>
-void depth_convert<uint16_t, uint8_t>(const Tensor<uint16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
-{
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        uint16_t val = in[i] >> shift;
-        out[i]       = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
-    }
-}
-
-template <>
-void depth_convert<uint16_t, uint32_t>(const Tensor<uint16_t> &in, Tensor<uint32_t> &out, ConvertPolicy policy, uint32_t shift)
-{
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        out[i] = static_cast<uint32_t>(in[i]) << shift;
-    }
-}
-
-template <>
-void depth_convert<int16_t, uint8_t>(const Tensor<int16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
-{
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        int16_t val = in[i] >> shift;
-        out[i]      = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
-    }
-}
-template <>
-void depth_convert<int16_t, int32_t>(const Tensor<int16_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
-{
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        out[i] = static_cast<int32_t>(in[i]) << shift;
+        out[i] = static_cast<T2>(in[i]);
     }
 }
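
The reference implementation above replaces the per-type specializations with enable_if-selected templates (fixed point <-> float via fixed_point_arithmetic, integer <-> integer via shift with optional saturation, float <-> float via a plain cast). A self-contained sketch of the same dispatch pattern on single values, with hypothetical names, showing which overload each type pair picks:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <type_traits>

// integer -> integer: widen with a left shift, narrow with a right shift plus optional saturation
template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
T2 convert_element(T1 in, uint32_t shift, bool saturate)
{
    if(std::numeric_limits<T1>::digits <= std::numeric_limits<T2>::digits)
    {
        return static_cast<T2>(in) << shift;
    }

    T1 val = in >> shift;
    if(saturate)
    {
        val = std::min<T1>(std::max<T1>(val, std::numeric_limits<T2>::lowest()), std::numeric_limits<T2>::max());
    }
    return static_cast<T2>(val);
}

// float -> fixed point: round to nearest (half away from zero) and saturate to the destination range
template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
T2 convert_element(T1 in, uint32_t fixed_point_position, bool)
{
    T1 val = in * (1 << fixed_point_position) + ((in >= 0) ? T1(0.5) : T1(-0.5));
    val    = std::min<T1>(std::max<T1>(val, std::numeric_limits<T2>::lowest()), std::numeric_limits<T2>::max());
    return static_cast<T2>(val);
}

// convert_element<uint8_t, uint16_t>(200, 1, false) -> 400   (up-cast overload)
// convert_element<int16_t, uint8_t>(1024, 1, true)  -> 255   (down-cast overload, saturated)
// convert_element<float, int16_t>(1.5f, 12, true)   -> 6144  (float -> fixed point overload)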