Select changes to enable fp16 in armv8a multi_isa builds

    * Code guarded with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC needs
      to be moved to an fp16.cpp file to allow compilation with
      -march=armv8.2-a+fp16

    * fp16.cpp needs to use the template select_op() which had to be moved from impl.cpp to impl.h

    * Partially resolves MLCE-1102

Change-Id: Ic9e73e121482fcc5e4fcbe8ae1ecd23649cbd3d1
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10359
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
diff --git a/Android.bp b/Android.bp
index ae0c79b..9b5f48a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -551,7 +551,6 @@
         "src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
         "src/cpu/kernels/range/generic/neon/fp16.cpp",
         "src/cpu/kernels/range/generic/neon/fp32.cpp",
-        "src/cpu/kernels/range/generic/neon/impl.cpp",
         "src/cpu/kernels/range/generic/neon/integer.cpp",
         "src/cpu/kernels/roialign/generic/neon/fp16.cpp",
         "src/cpu/kernels/roialign/generic/neon/fp32.cpp",
@@ -563,7 +562,6 @@
         "src/cpu/kernels/scale/neon/qasymm8_signed.cpp",
         "src/cpu/kernels/select/generic/neon/fp16.cpp",
         "src/cpu/kernels/select/generic/neon/fp32.cpp",
-        "src/cpu/kernels/select/generic/neon/impl.cpp",
         "src/cpu/kernels/select/generic/neon/integer.cpp",
         "src/cpu/kernels/softmax/generic/neon/fp16.cpp",
         "src/cpu/kernels/softmax/generic/neon/fp32.cpp",
diff --git a/filelist.json b/filelist.json
index b74e2e2..0cf6773 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2055,7 +2055,6 @@
             "src/runtime/NEON/functions/NERange.cpp"
           ],
           "neon": {
-            "common":  [ "src/cpu/kernels/range/generic/neon/impl.cpp" ],
             "fp32":    [ "src/cpu/kernels/range/generic/neon/fp32.cpp" ],
             "fp16":    [ "src/cpu/kernels/range/generic/neon/fp16.cpp" ],
             "integer": [ "src/cpu/kernels/range/generic/neon/integer.cpp"   ]
@@ -2161,7 +2160,6 @@
             "src/runtime/NEON/functions/NESelect.cpp"
           ],
           "neon": {
-            "common": [ "src/cpu/kernels/select/generic/neon/impl.cpp" ],
             "fp32": [ "src/cpu/kernels/select/generic/neon/fp32.cpp" ],
             "fp16": [ "src/cpu/kernels/select/generic/neon/fp16.cpp" ],
             "integer": [ "src/cpu/kernels/select/generic/neon/integer.cpp"  ]
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index f4fa950..786a38f 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -800,7 +800,6 @@
 	"cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
 	"cpu/kernels/range/generic/neon/fp16.cpp",
 	"cpu/kernels/range/generic/neon/fp32.cpp",
-	"cpu/kernels/range/generic/neon/impl.cpp",
 	"cpu/kernels/range/generic/neon/integer.cpp",
 	"cpu/kernels/roialign/generic/neon/fp16.cpp",
 	"cpu/kernels/roialign/generic/neon/fp32.cpp",
@@ -812,7 +811,6 @@
 	"cpu/kernels/scale/neon/qasymm8_signed.cpp",
 	"cpu/kernels/select/generic/neon/fp16.cpp",
 	"cpu/kernels/select/generic/neon/fp32.cpp",
-	"cpu/kernels/select/generic/neon/impl.cpp",
 	"cpu/kernels/select/generic/neon/integer.cpp",
 	"cpu/kernels/softmax/generic/neon/fp16.cpp",
 	"cpu/kernels/softmax/generic/neon/fp32.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index eb17d51..db40bd7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -791,7 +791,6 @@
 	cpu/kernels/pool3d/neon/qasymm8_signed.cpp
 	cpu/kernels/range/generic/neon/fp16.cpp
 	cpu/kernels/range/generic/neon/fp32.cpp
-	cpu/kernels/range/generic/neon/impl.cpp
 	cpu/kernels/range/generic/neon/integer.cpp
 	cpu/kernels/roialign/generic/neon/fp16.cpp
 	cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -803,7 +802,6 @@
 	cpu/kernels/scale/neon/qasymm8_signed.cpp
 	cpu/kernels/select/generic/neon/fp16.cpp
 	cpu/kernels/select/generic/neon/fp32.cpp
-	cpu/kernels/select/generic/neon/impl.cpp
 	cpu/kernels/select/generic/neon/integer.cpp
 	cpu/kernels/softmax/generic/neon/fp16.cpp
 	cpu/kernels/softmax/generic/neon/fp32.cpp
diff --git a/src/cpu/kernels/range/generic/neon/impl.cpp b/src/cpu/kernels/range/generic/neon/impl.cpp
deleted file mode 100644
index f91251c..0000000
--- a/src/cpu/kernels/range/generic/neon/impl.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T>
-void neon_range_function(ITensor *output, float start, float step, const Window &window)
-{
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
-
-    const auto step_vec  = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
-    const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
-    auto       id_vec    = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const int  window_step_x  = 16 / sizeof(T);
-
-    Window win{ window };
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    Iterator output_it(output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int        x       = window_start_x;
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            for(int count = 0; count < window_step_x; ++count)
-            {
-                id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
-            }
-
-            // start + step * id
-            const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
-            wrapper::vstore(out_ptr + x, res_vec);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const auto res = start + x * step;
-            *(out_ptr + x) = res;
-        }
-
-    },
-    output_it);
-}
-
-template void neon_range_function<uint8_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<uint16_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<uint32_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<int8_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<int16_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<int32_t>(ITensor *output, float start, float step, const Window &window);
-template void neon_range_function<float32_t>(ITensor *output, float start, float step, const Window &window);
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void neon_range_function<float16_t>(ITensor *output, float start, float step, const Window &window);
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h
index 7ac2fc9..62144e6 100644
--- a/src/cpu/kernels/range/generic/neon/impl.h
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,62 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
-#define SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
 
 namespace arm_compute
 {
-class ITensor;
-class Window;
-
 namespace cpu
 {
 template <typename T>
-void neon_range_function(ITensor *output, float start, float step, const Window &window);
+void neon_range_function(ITensor *output, float start, float step, const Window &window)
+{
+    /** SIMD vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type;
+
+    const auto step_vec  = wrapper::vdup_n(static_cast<T>(step), ExactTagType{});
+    const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{});
+    auto       id_vec    = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+    const int  window_step_x  = 16 / sizeof(T);
+
+    Window win{ window };
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator output_it(output, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        int        x       = window_start_x;
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            for(int count = 0; count < window_step_x; ++count)
+            {
+                id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+            }
+
+            // start + step * id
+            const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+            wrapper::vstore(out_ptr + x, res_vec);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const auto res = start + x * step;
+            *(out_ptr + x) = res;
+        }
+
+    },
+    output_it);
+}
 } // namespace cpu
 } // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_RANGE_IMPL_H
+#endif // ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/select/generic/neon/impl.cpp b/src/cpu/kernels/select/generic/neon/impl.cpp
deleted file mode 100644
index 62c46f3..0000000
--- a/src/cpu/kernels/select/generic/neon/impl.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/TensorInfo.h"
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-#include "src/core/NEON/NEAsymm.h"
-
-#include <arm_neon.h>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
-{
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator condition(cond, win);
-    Iterator input1(in1, win);
-    Iterator input2(in2, win);
-    Iterator output(out, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
-        const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
-        const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
-
-        int x = window_start_x;
-        for(; x <= limit; x += window_step_x)
-        {
-            const auto c = (*condition_conversion)(condition_ptr + x);
-            const auto a = wrapper::vloadq(input1_ptr + x);
-            const auto b = wrapper::vloadq(input2_ptr + x);
-            wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            const auto c      = *(condition_ptr + x);
-            const auto a      = *(input1_ptr + x);
-            const auto b      = *(input2_ptr + x);
-            *(output_ptr + x) = static_cast<bool>(c) ? a : b;
-        }
-    },
-    condition, input1, input2, output);
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
-    });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
-    });
-}
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    const auto window_step_x  = 16 / sizeof(ScalarType);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
-    });
-}
-
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    ARM_COMPUTE_UNUSED(window);
-
-    auto       output_ptr    = reinterpret_cast<ScalarType *>(out->buffer());
-    const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
-    const auto input1_ptr    = reinterpret_cast<const ScalarType *>(in1->buffer());
-    const auto input2_ptr    = reinterpret_cast<const ScalarType *>(in2->buffer());
-
-    const int outer_size = cond->info()->total_size() / cond->info()->element_size();
-    const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
-    int       offset     = 0;
-    const int step       = 16 / in1->info()->element_size();
-
-    for(int i = 0; i < outer_size; ++i)
-    {
-        int        x         = offset;
-        const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
-        for(; x <= offset + inner_size - step; x += step)
-        {
-            wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
-        }
-        if(x <= offset + inner_size - (step / 2))
-        {
-            wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
-            x += step / 2;
-        }
-        for(; x < offset + inner_size; ++x)
-        {
-            *(output_ptr + x) = *(input_ptr + x);
-        }
-        offset += inner_size;
-    }
-}
-
-template void select_op_32<float, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<float>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_8<int8_t, uint8x16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_16<int16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void select_op_16<float16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
-template void select_op_32<int32_t, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<int8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<int16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void select_op_not_same_rank<float16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
-template void select_op_not_same_rank<int32_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_8<uint8_t, uint8x16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_16<uint16_t, uint16x8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_32<uint32_t, uint32x4_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<uint8_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<uint16_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void select_op_not_same_rank<uint32_t>(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-} // namespace cpu
-
-} // namespace arm_compute
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
index 2bbc38b..6a6d996 100644
--- a/src/cpu/kernels/select/generic/neon/impl.h
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,34 +21,136 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
-#define SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
+#ifndef ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
 
-#include <cstdint>
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
+
+#include <arm_neon.h>
+#include <map>
+#include <string>
 
 namespace arm_compute
 {
-class ITensor;
-class Window;
-
 namespace cpu
 {
-template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template <typename ScalarType, typename VectorType>
-void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template <typename ScalarType, typename VectorType>
-void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template <typename ScalarType, typename VectorType>
-void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
 template <typename ScalarType, typename VectorType>
 void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *));
+               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+{
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
+    Iterator condition(cond, win);
+    Iterator input1(in1, win);
+    Iterator input2(in2, win);
+    Iterator output(out, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
+        const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+        const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
+        const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
+
+        int x = window_start_x;
+        for(; x <= limit; x += window_step_x)
+        {
+            const auto c = (*condition_conversion)(condition_ptr + x);
+            const auto a = wrapper::vloadq(input1_ptr + x);
+            const auto b = wrapper::vloadq(input2_ptr + x);
+            wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            const auto c      = *(condition_ptr + x);
+            const auto a      = *(input1_ptr + x);
+            const auto b      = *(input2_ptr + x);
+            *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+        }
+    },
+    condition, input1, input2, output);
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+    });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+    });
+}
+
+template <typename ScalarType, typename VectorType>
+void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    const auto window_step_x  = 16 / sizeof(ScalarType);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
+    {
+        static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+        return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+    });
+}
+
+template <typename ScalarType>
+void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    ARM_COMPUTE_UNUSED(window);
+
+    auto       output_ptr    = reinterpret_cast<ScalarType *>(out->buffer());
+    const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer());
+    const auto input1_ptr    = reinterpret_cast<const ScalarType *>(in1->buffer());
+    const auto input2_ptr    = reinterpret_cast<const ScalarType *>(in2->buffer());
+
+    const int outer_size = cond->info()->total_size() / cond->info()->element_size();
+    const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size;
+    int       offset     = 0;
+    const int step       = 16 / in1->info()->element_size();
+
+    for(int i = 0; i < outer_size; ++i)
+    {
+        int        x         = offset;
+        const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
+        for(; x <= offset + inner_size - step; x += step)
+        {
+            wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
+        }
+        if(x <= offset + inner_size - (step / 2))
+        {
+            wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
+            x += step / 2;
+        }
+        for(; x < offset + inner_size; ++x)
+        {
+            *(output_ptr + x) = *(input_ptr + x);
+        }
+        offset += inner_size;
+    }
+}
 } // namespace cpu
 } // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_SELECT_IMPL_H
\ No newline at end of file
+#endif // ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H