COMPMID-3523: Fix NEDepthConvertLayerKernel f16 casting

* Force saturation for F16->QASYMM8 and F16->QASYMM8_SIGNED conversions
* Fix S32->F16 casting (left-over elements were cast to int8_t instead of float16_t)

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ic4be3865794947c577897cd9ad8554be4ebfe9bc
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3324
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 79dc2cb..cbb746c 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -849,7 +849,7 @@
                     const float16_t   scale_s = 1 << _shift;
                     const float16x8_t scale   = vdupq_n_f16(scale_s);
 
-                    /* Up-conversion F16 -> QASYMM8_SIGNED */
+                    /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
                     execute_window_loop(win, [&](const Coordinates &)
                     {
                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
@@ -872,7 +872,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                         }
                     },
                     input, output);
@@ -884,7 +884,7 @@
                     const float16_t   scale_s = 1 << _shift;
                     const float16x8_t scale   = vdupq_n_f16(scale_s);
 
-                    /* Up-conversion F16 -> U8 */
+                    /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
                     execute_window_loop(win, [&](const Coordinates &)
                     {
                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
@@ -907,7 +907,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                         }
 
                     },
@@ -1215,7 +1215,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                         }
                     },
                     input, output);