Optimize add layer by considering the input tensors as 1D arrays

Resolves: COMPMID-5108

Change-Id: I544f8160fbe5b4ffbef348d1fbd3dd626a6e1bdb
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8002
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index ad3e445..67985c9 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,6 +128,35 @@
     }
 }
 
+/** Element-wise add of src0 and src1 into dst, indexing each tensor's buffer() as a flat array of ScalarType.
+ *  Only the x-range [window.x().start(), window.x().end()) is processed. With ConvertPolicy::SATURATE the
+ *  wrapper::vqadd / wrapper::add_sat variants are used; otherwise wrapper::vadd / operator+ are used. */
+template <typename ScalarType>
+void add_same_neon_as_1d_array(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    const auto in0 = reinterpret_cast<const ScalarType *>(src0->buffer());
+    const auto in1 = reinterpret_cast<const ScalarType *>(src1->buffer());
+    const auto out = reinterpret_cast<ScalarType *>(dst->buffer());
+
+    constexpr int lanes    = 16 / sizeof(ScalarType); // number of ScalarType elements in 16 bytes
+    const int     end      = static_cast<int>(window.x().end());
+    const bool    saturate = (policy == ConvertPolicy::SATURATE);
+
+    // Vector loop: runs only while a full group of `lanes` elements remains before `end`.
+    int idx = static_cast<int>(window.x().start());
+    for(; idx <= (end - lanes); idx += lanes)
+    {
+        const auto a = wrapper::vloadq(&in0[idx]);
+        const auto b = wrapper::vloadq(&in1[idx]);
+        wrapper::vstore(&out[idx], saturate ? wrapper::vqadd(a, b) : wrapper::vadd(a, b));
+    }
+    // Scalar loop: handles the left-over elements one at a time.
+    for(; idx < end; ++idx)
+    {
+        out[idx] = saturate ? wrapper::add_sat(in0[idx], in1[idx]) : in0[idx] + in1[idx];
+    }
+}
+
 template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
@@ -137,5 +166,14 @@
 template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
+template void add_same_neon_as_1d_array<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_neon_as_1d_array<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+// The float16_t instantiation below is emitted only when both macros tested by the guard are defined.
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void add_same_neon_as_1d_array<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+
 } // namespace cpu
 } // namespace arm_compute