COMPMID-2252 NECast.

Change-Id: I7532aea6827a325eb8457132d4787ac527e93cd4
Signed-off-by: Usama Arif <usama.arif@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1149
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index cbc90a0..531873e 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -44,16 +44,17 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON(input == output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
-                                    "Only data_types supported [in] QASYMM8 -> [out] F16, F32");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
+                                                                           && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
+                                    "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
-                                                                           && output->data_type() != DataType::S32),
-                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+                                                                           && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
+                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
                                     "Only data_types supported [in] U16 ->  [out] U8, U32");
@@ -61,11 +62,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
                                     "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F32),
-                                    "Only data_types supported [in] F16 ->  [out] QASYMM8, F32");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::U8 && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
+                                    "Only data_types supported [in] F16 ->  [out] QASYMM8, F32, S32, U8");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16),
-                                    "Only data_types supported [in] F32 ->  [out] QASYMM8, F16");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
+                                    "Only data_types supported [in] F32 ->  [out] QASYMM8, F16, S32, U8");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
+                                    "Only data_types supported [in] S32 ->  [out] QASYMM8, F16, F32, U8");
 
     // Validate in case of configured output
     if(output->total_size() > 0)
@@ -139,74 +143,6 @@
     switch(_input->info()->data_type())
     {
         case DataType::QASYMM8:
-        {
-            switch(_output->info()->data_type())
-            {
-                /* Up-conversion QASYMM8 -> F32 */
-                case DataType::F32:
-                {
-                    const float32x4_t scale  = vdupq_n_f32(_input->info()->quantization_info().scale);
-                    const int32x4_t   offset = vdupq_n_s32(_input->info()->quantization_info().offset);
-
-                    execute_window_loop(window, [&](const Coordinates &)
-                    {
-                        const uint8x16_t   texels_u8 = vld1q_u8(input.ptr());
-                        const uint16x8x2_t texels_u16 =
-                        {
-                            {
-                                vmovl_u8(vget_low_u8(texels_u8)),
-                                vmovl_u8(vget_high_u8(texels_u8))
-                            }
-                        };
-
-                        const int32x4x4_t texels_s32 =
-                        {
-                            {
-                                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[0]))),
-                                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[0]))),
-                                vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[1]))),
-                                vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[1])))
-                            }
-                        };
-
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[0], offset)), scale));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[1], offset)), scale));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[2], offset)), scale));
-                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[3], offset)), scale));
-                    },
-                    input, output);
-                    break;
-                }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                /* Up-conversion QASYMM8 -> F16 */
-                case DataType::F16:
-                {
-                    const float16x8_t scale  = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale));
-                    const int16x8_t   offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset));
-
-                    execute_window_loop(window, [&](const Coordinates &)
-                    {
-                        const uint8x16_t  texels_u8 = vld1q_u8(input.ptr());
-                        const int16x8x2_t texels_s16 =
-                        {
-                            {
-                                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
-                                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
-                            }
-                        };
-
-                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[0], offset)), scale));
-                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[1], offset)), scale));
-                    },
-                    input, output);
-                    break;
-                }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                default:
-                    ARM_COMPUTE_ERROR("Output data type not supported");
-            }
-            break;
-        }
         case DataType::U8:
         {
             const int16x8_t b = vdupq_n_s16(_shift);
@@ -257,6 +193,51 @@
                     input, output);
                     break;
                 }
+                case DataType::F32:
+                {
+                    /* Up-conversion U8 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+                            }
+                        };
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+                    },
+                    input, output);
+                    break;
+                }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                {
+                    /* Up-conversion U8 -> F16 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+                        const int16x8x2_t texels =
+                        {
+                            {
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+                                vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+                            }
+                        };
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcvtq_f16_s16(texels.val[0]));
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcvtq_f16_s16(texels.val[1]));
+                    },
+                    input, output);
+                    break;
+                }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
                 case DataType::U16:
                 {
                     /* Up-conversion U8 -> U16 */
@@ -441,13 +422,11 @@
             switch(_output->info()->data_type())
             {
                 case DataType::QASYMM8:
+                case DataType::U8:
                 {
-                    const float16x8_t scale        = vinvq_f16(vdupq_n_f16(static_cast<float16_t>(_output->info()->quantization_info().scale)));
-                    const int16x8_t   offset       = vdupq_n_s16(static_cast<int16_t>(_output->info()->quantization_info().offset));
-                    const int16x8_t   max_val_vec  = vdupq_n_s16(255);
-                    const int16x8_t   zero_val_vec = vdupq_n_s16(0);
+                    const float16x8_t scale = vdupq_n_f16(1 << _shift);
 
-                    /* Down-conversion F16 -> QASYMM8 */
+                    /* Up-conversion F16 -> U8 */
                     execute_window_loop(window, [&](const Coordinates & id)
                     {
                         const float16x8x2_t texels =
@@ -458,9 +437,7 @@
                             }
                         };
 
-                        const auto texel_quantized_0 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[0]), offset), max_val_vec), zero_val_vec);
-                        const auto texel_quantized_1 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[1]), offset), max_val_vec), zero_val_vec);
-                        vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(texel_quantized_0), vqmovun_s16(texel_quantized_1)));
+                        vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
                     },
                     input, output);
                     break;
@@ -488,6 +465,29 @@
                     input, output);
                     break;
                 }
+                case DataType::S32:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1 << _shift);
+
+                    /* Up-conversion F16 -> S32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float16x8x2_t texels =
+                        {
+                            {
+                                vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
+                                vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
+                    },
+                    input, output);
+                    break;
+                }
                 default:
                     ARM_COMPUTE_ERROR("Output data type not supported");
             }
@@ -496,39 +496,6 @@
         case DataType::F32:
             switch(_output->info()->data_type())
             {
-                case DataType::QASYMM8:
-                {
-                    const float32x4_t scale        = vinvq_f32(vdupq_n_f32(_output->info()->quantization_info().scale));
-                    const int32x4_t   offset       = vdupq_n_s32(_output->info()->quantization_info().offset);
-                    const int32x4_t   max_val_vec  = vdupq_n_s32(255);
-                    const int32x4_t   zero_val_vec = vdupq_n_s32(0);
-
-                    /* Down-conversion F32 -> QASYMM8 */
-                    execute_window_loop(window, [&](const Coordinates &)
-                    {
-                        const float32x4x4_t texels =
-                        {
-                            {
-                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
-                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
-                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
-                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
-                            }
-                        };
-
-                        const auto texel_quantized_0 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[0]), offset), max_val_vec), zero_val_vec);
-                        const auto texel_quantized_1 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[1]), offset), max_val_vec), zero_val_vec);
-                        const auto texel_quantized_2 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[2]), offset), max_val_vec), zero_val_vec);
-                        const auto texel_quantized_3 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[3]), offset), max_val_vec), zero_val_vec);
-
-                        const auto converted_0 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_0), vqmovun_s32(texel_quantized_1)));
-                        const auto converted_1 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_2), vqmovun_s32(texel_quantized_3)));
-
-                        vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(converted_0, converted_1));
-                    },
-                    input, output);
-                    break;
-                }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                 {
@@ -554,6 +521,161 @@
                     break;
                 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                case DataType::S32:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+                    /* Conversion F32 -> S32 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
+                            }
+                        };
+
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(texels.val[0]));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(texels.val[1]));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(texels.val[2]));
+                        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(texels.val[3]));
+                    },
+                    input, output);
+                    break;
+                }
+                case DataType::QASYMM8:
+                case DataType::U8:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+                    /* Down-conversion F32 -> U8 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale),
+                            }
+                        };
+
+                        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
+                        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
+                    },
+                    input, output);
+                    break;
+                }
+
+
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+
+        case DataType::S32:
+            switch(_output->info()->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+                    /* Down-conversion S32 -> F16 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()))), scale),
+                                vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4)), scale),
+                                vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8)), scale),
+                                vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12)), scale)
+                            }
+                        };
+
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+                    },
+                    input, output);
+                    break;
+                }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                case DataType::F32:
+                {
+                    const int32x4_t scale = vdupq_n_s32(1.f / (1 << _shift));
+
+                    /* Conversion S32 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates &)
+                    {
+                        const int32x4x4_t texels =
+                        {
+                            {
+                                vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), scale),
+                                vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), scale),
+                                vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), scale),
+                                vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), scale),
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(texels.val[0]));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(texels.val[1]));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(texels.val[2]));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(texels.val[3]));
+                    },
+                    input, output);
+                    break;
+                }
+                case DataType::QASYMM8:
+                case DataType::U8:
+                {
+                    const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
+
+                    /* Down-conversion S32 -> U8 */
+                    if(ConvertPolicy::SATURATE == _policy)
+                    {
+                        execute_window_loop(window, [&](const Coordinates &)
+                        {
+                            const int32x4x4_t texels =
+                            {
+                                {
+                                    vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
+                                    vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
+                                    vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
+                                    vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
+                                }
+                            };
+
+                            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
+                            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
+                        },
+                        input, output);
+                    }
+                    else
+                    {
+                        execute_window_loop(window, [&](const Coordinates &)
+                        {
+                            const int32x4x4_t texels =
+                            {
+                                {
+                                    vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b),
+                                    vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b),
+                                    vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b),
+                                    vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b)
+                                }
+                            };
+
+                            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
+                            vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
+                        },
+                        input, output);
+                    }
+                    break;
+                }
                 default:
                     ARM_COMPUTE_ERROR("Output data type not supported");
             }