COMPMID-3968 30% regression on FSSD v1 25 Grayscale

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ib1ecd7aa10fec0b7e2b3d929e212c1af34c0f58d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4533
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/wrapper/intrinsics/mla.h b/src/core/NEON/wrapper/intrinsics/mla.h
index 2b38b34..9fb5a08 100644
--- a/src/core/NEON/wrapper/intrinsics/mla.h
+++ b/src/core/NEON/wrapper/intrinsics/mla.h
@@ -66,6 +66,22 @@
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 #undef VMLA_IMPL
+
+#define VMLAL_IMPL(vtype_in, vtype_out, postfix)                                     \
+    inline vtype_out vmlal(const vtype_out &a, const vtype_in &b, const vtype_in &c) \
+    {                                                                                \
+        return vmlal_##postfix(a, b, c);                                             \
+    }
+
+VMLAL_IMPL(uint8x8_t, uint16x8_t, u8)
+VMLAL_IMPL(int8x8_t, int16x8_t, s8)
+VMLAL_IMPL(uint16x4_t, uint32x4_t, u16)
+VMLAL_IMPL(int16x4_t, int32x4_t, s16)
+VMLAL_IMPL(uint32x2_t, uint64x2_t, u32)
+VMLAL_IMPL(int32x2_t, int64x2_t, s32)
+
+#undef VMLAL_IMPL
+
 } // namespace wrapper
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_WRAPPER_MLA_H */
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
index 0c26cd9..cf00a4a 100644
--- a/src/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -42,7 +42,7 @@
     }
 
 VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16)
-
+VREINTERPRET_IMPL(int16x8_t, uint16x8_t, vreinterpretq, s16, u16)
 VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32)
 } // namespace wrapper
 } // namespace arm_compute