L2Norm changes to enable fp16 in armv8a multi_isa builds

    * Code guarded with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC needs
      to be moved to an fp16.cpp file to allow compilation with
      -march=armv8.2-a+fp16

    * fp16.cpp needs to use the templates l2_normalize_x() and
      l2_normalize_yz(), which have been moved from impl.cpp to impl.h
      (see the sketch after this list)

    * Removed impl.cpp

    * Partially resolves MLCE-1102
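
    A minimal sketch of how fp16.cpp can use the header templates, reusing
    the guards from the removed impl.cpp. The wrapper function name
    neon_fp16_l2_normalize_x below is illustrative only, not the actual
    symbol registered in the library:

        // fp16.cpp: compiled with -march=armv8.2-a+fp16 in multi_isa builds
        #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)

        #include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"

        namespace arm_compute
        {
        namespace cpu
        {
        // Instantiate the header template for float16_t (8 lanes per 128-bit vector)
        void neon_fp16_l2_normalize_x(
            const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
        {
            return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
        }
        } // namespace cpu
        } // namespace arm_compute

        #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC && ENABLE_FP16_KERNELS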

Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Change-Id: Id00a823730108293fc712295a178dad80588af30
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10344
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 1603739..696942c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -531,7 +531,6 @@
         "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
         "src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp",
         "src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp",
-        "src/cpu/kernels/l2normlayer/generic/neon/impl.cpp",
         "src/cpu/kernels/lut/generic/neon/u8.cpp",
         "src/cpu/kernels/maxunpool/generic/neon/fp16.cpp",
         "src/cpu/kernels/maxunpool/generic/neon/fp32.cpp",
diff --git a/filelist.json b/filelist.json
index e4627f8..215b363 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1792,7 +1792,6 @@
             "src/runtime/NEON/functions/NEL2NormalizeLayer.cpp"
           ],
           "neon":{
-            "common":["src/cpu/kernels/l2normlayer/generic/neon/impl.cpp"],
             "fp32":["src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp"],
             "fp16":["src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp"]
           }
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index 3b42839..d4d1fc8 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -779,7 +779,6 @@
 	"cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
 	"cpu/kernels/l2normlayer/generic/neon/fp16.cpp",
 	"cpu/kernels/l2normlayer/generic/neon/fp32.cpp",
-	"cpu/kernels/l2normlayer/generic/neon/impl.cpp",
 	"cpu/kernels/lut/generic/neon/u8.cpp",
 	"cpu/kernels/maxunpool/generic/neon/fp16.cpp",
 	"cpu/kernels/maxunpool/generic/neon/fp32.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0b3da44..ee1ff47 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -771,7 +771,6 @@
 	cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
 	cpu/kernels/l2normlayer/generic/neon/fp16.cpp
 	cpu/kernels/l2normlayer/generic/neon/fp32.cpp
-	cpu/kernels/l2normlayer/generic/neon/impl.cpp
 	cpu/kernels/lut/generic/neon/u8.cpp
 	cpu/kernels/maxunpool/generic/neon/fp16.cpp
 	cpu/kernels/maxunpool/generic/neon/fp32.cpp
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp b/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp
deleted file mode 100644
index 2886537..0000000
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2017-2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T, int S>
-void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
-{
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input_it(in, win_collapsed);
-    Iterator sum_it(sum, win_collapsed);
-    Iterator output_it(out, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
-        const T    sum_value      = *reinterpret_cast<const T *>(sum_it.ptr());
-        const T    norm_value     = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
-        const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
-        // Compute elements over vector steps
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            out_ptr[x] = in_ptr[x] * norm_value;
-        }
-    },
-    input_it, sum_it, output_it);
-}
-
-template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
-{
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Window window_sum(win);
-    window_sum.set(axis, Window::Dimension(0, 0, 0));
-
-    Iterator input_it(in, win);
-    Iterator sum_it(sum, window_sum);
-    Iterator output_it(out, win);
-
-    const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-        const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
-        // Compute elements over vector steps
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
-            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
-            out_ptr[x]         = in_ptr[x] * norm_value;
-        }
-    },
-    input_it, sum_it, output_it);
-}
-
-template void l2_normalize_yz<float, 4>(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);
-template void l2_normalize_x<float, 4>(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window);
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void l2_normalize_yz<float16_t, 8>(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);
-template void l2_normalize_x<float16_t, 8>(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window);
-#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
index 98391fb..a06cdd3 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,21 +24,102 @@
 #ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H
 #define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H
 
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/common/Registrars.h"
+
 #include <cstddef>
 
 namespace arm_compute
 {
-class ITensor;
-class Window;
-
 namespace cpu
 {
 template <typename T, int S>
-void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window);
+void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+{
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input_it(in, win_collapsed);
+    Iterator sum_it(sum, win_collapsed);
+    Iterator output_it(out, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+        const T    sum_value      = *reinterpret_cast<const T *>(sum_it.ptr());
+        const T    norm_value     = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+        const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+        // Compute elements over vector steps
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            out_ptr[x] = in_ptr[x] * norm_value;
+        }
+    },
+    input_it, sum_it, output_it);
+}
 
 template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis);
+void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+{
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
+    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Window window_sum(win);
+    window_sum.set(axis, Window::Dimension(0, 0, 0));
+
+    Iterator input_it(in, win);
+    Iterator sum_it(sum, window_sum);
+    Iterator output_it(out, win);
+
+    const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+        const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+
+        // Compute elements over vector steps
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+            out_ptr[x]         = in_ptr[x] * norm_value;
+        }
+    },
+    input_it, sum_it, output_it);
+}
 } // namespace cpu
 } // namespace arm_compute
 #endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H