Changes to enable FP16 in armv8a multi_isa

* This is the initial patch to start working on enabling fp16 in all
  multi_isa builds. More changes are required in the way we register
  the kernels using the macro REGISTER_FP16_NEON.

* In this patch we add the capability to build the fp16 files listed in
  filelist.json with the correct arch option to enable FP16.

* This patch is a required step towards building a universal multi_isa binary
  where fp16 is enabled.

* Enable REGISTER_FP16_NEON macro for all builds by removing
  __ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard from the macro definition.
  The macro has to be used across all types of builds.

Change-Id: I99f4c273f6ee04cad3c097e5e374200f48568fa9
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10682
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/SConscript b/SConscript
index 9069df9..31e7a5b 100644
--- a/SConscript
+++ b/SConscript
@@ -82,7 +82,7 @@
 #         A list of static objects
 #         A list of shared objects
 
-def build_lib_objects():
+def build_multiisa_lib_objects():
     lib_static_objs = [] # static objects
     lib_shared_objs = [] # shared objects
 
@@ -93,20 +93,30 @@
 
     # Build all the common files for the base architecture
     if env['arch'] == 'armv8a':
-        lib_static_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=True)
-        lib_shared_objs += build_obj_list(filedefs["armv8-a"], lib_files, static=False)
+        lib_static_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=True)
+        lib_shared_objs += build_obj_list(filedefs["armv8-a"], misa_lib_files, static=False)
     else:
-        lib_static_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=True)
-        lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], lib_files, static=False)
+        lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=True)
+        lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files, static=False)
+
+    # Build the FP16 specific files
+    lib_static_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=True)
+    lib_shared_objs += build_obj_list(filedefs["armv8.2-a"], misa_lib_files_neon_fp16, static=False)
 
     # Build the SVE specific files
-    lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=True)
-    lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], lib_files_sve, static=False)
+    lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=True)
+    lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve, static=False)
+    lib_static_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=True)
+    lib_shared_objs += build_obj_list(filedefs["armv8.2-a-sve"], misa_lib_files_sve_fp16, static=False)
+
 
     # Build the SVE2 specific files
     arm_compute_env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_SVE2'])
-    lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=True)
-    lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], lib_files_sve2, static=False)
+    lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=True)
+    lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2, static=False)
+    lib_static_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=True)
+    lib_shared_objs += build_obj_list(filedefs["armv8.6-a-sve2"], misa_lib_files_sve2_fp16, static=False)
+
 
     return lib_static_objs, lib_shared_objs
 
@@ -284,29 +294,29 @@
     return attrs
 
 
-def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]):
+def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[], include_common=True):
     files = { "common" : [] }
-
     # Early return if filelist is empty
     if backend not in filelist:
         return files
-
     # Iterate over operators and create the file lists to compiler
     for operator in operators:
         if operator in filelist[backend]['operators']:
-            files['common'] += filelist[backend]['operators'][operator]["files"]["common"]
+            if include_common :
+                files['common'] += filelist[backend]['operators'][operator]["files"]["common"]
             for tech in techs:
                 if tech in filelist[backend]['operators'][operator]["files"]:
                     # Add tech as a key to dictionary if not there
                     if tech not in files:
                         files[tech] = []
-
                     # Add tech files to the tech file list
                     tech_files = filelist[backend]['operators'][operator]["files"][tech]
-                    files[tech] += tech_files.get('common', [])
+                    if include_common:
+                        files[tech] += tech_files.get('common', [])
                     for attr in attrs:
                         files[tech] += tech_files.get(attr, [])
 
+
     # Remove duplicates if they exist
     return {k: list(set(v)) for k,v in files.items()}
 
@@ -608,6 +618,17 @@
 lib_files_sve = []
 lib_files_sve2 = []
 
+# the variables below are used for the multi_isa builds
+# please note that the variable names without the _fp16 suffix
+# do not hold any fp16 files.
+
+misa_lib_files = lib_files
+misa_lib_files_sve = []
+misa_lib_files_sve2 = []
+misa_lib_files_neon_fp16 = []
+misa_lib_files_sve_fp16 = []
+misa_lib_files_sve2_fp16 = []
+
 if env['neon']:
     # build winograd/depthwise sources for either v7a / v8a
     arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/arm_gemm",
@@ -620,8 +641,6 @@
                                       "arm_compute/core/NEON/kernels/assembly/",
                                       "src/cpu/kernels/assembly/"])
 
-    lib_files += filelist['cpu']['common']
-
     # Setup SIMD file list to include
     simd = ['neon']
     if env['multi_isa']:
@@ -636,7 +655,6 @@
     else:
         attrs = get_attrs_list(env, env['data_type_support'], env['data_layout_support'])
 
-
     if env['fixed_format_kernels']:
         attrs.append("fixed_format_kernels")
 
@@ -644,19 +662,46 @@
     cpu_operators = custom_operators if use_custom_ops else filelist['cpu']['operators'].keys()
     cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu')
 
-    cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+    if env['multi_isa']:
+        misa_lib_files += filelist['cpu']['common']
 
-    # Shared among ALL CPU files
-    lib_files += cpu_files.get('common', [])
+        # For multi_isa builds we need to build fp16 files for armv8.2-a+fp16 so we filter them out of cpu_files removing the attribute fp16
+        attrs.remove('fp16')
+        cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
 
-    # Arm® Neon™ specific files
-    lib_files += cpu_files.get('neon', [])
+        # Shared among ALL CPU files
+        misa_lib_files += cpu_files.get('common', [])
 
-    # SVE files only
-    lib_files_sve = cpu_files.get('sve', [])
+        # Arm® Neon™ specific files
+        misa_lib_files += cpu_files.get('neon', [])
 
-    # SVE2 files only
-    lib_files_sve2 = cpu_files.get('sve2', [])
+        # Get all the fp16 files
+        fp16_cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, ['fp16'],False)
+
+        misa_lib_files_neon_fp16 = fp16_cpu_files.get('neon',[])
+        misa_lib_files_sve_fp16 = fp16_cpu_files.get('sve',[])
+        misa_lib_files_sve2_fp16 = fp16_cpu_files.get('sve2',[])
+
+        # SVE files only minus FP16
+        misa_lib_files_sve = cpu_files.get('sve', [])
+
+        # SVE2 files only minus FP16
+        misa_lib_files_sve2 = cpu_files.get('sve2', [])
+    else:
+        lib_files += filelist['cpu']['common']
+
+        # Non multi_isa build
+        cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+
+        # Shared among ALL CPU files
+        lib_files += cpu_files.get('common', [])
+
+        # Arm® Neon™ specific files
+        lib_files += cpu_files.get('neon', [])
+
+        lib_files_sve = cpu_files.get('sve', [])
+
+        lib_files_sve2 = cpu_files.get('sve2', [])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
@@ -674,7 +719,7 @@
 
 
 if (env['multi_isa']):
-    lib_static_objs, lib_shared_objs = build_lib_objects()
+    lib_static_objs, lib_shared_objs = build_multiisa_lib_objects()
 
 
 # STATIC library build.
diff --git a/SConstruct b/SConstruct
index 68c518a..3eee4c0 100644
--- a/SConstruct
+++ b/SConstruct
@@ -62,8 +62,14 @@
 
 def update_data_type_layout_flags(env, data_types, data_layouts):
     # Manage data-types
-    if any(i in data_types for i in ['all', 'fp16']):
-        env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
+    if env['multi_isa']:
+        if  any(i in data_types for i in ['all', 'fp16']):
+            env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
+    else:
+            if not 'v8a' in env['arch'] and not 'v7a' in env['arch']:
+                if  any(i in data_types for i in ['all', 'fp16']):
+                    env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
+
     if any(i in data_types for i in ['all', 'fp32']):
         env.Append(CXXFLAGS = ['-DENABLE_FP32_KERNELS'])
     if any(i in data_types for i in ['all', 'qasymm8']):
@@ -112,7 +118,7 @@
     BoolVariable("exceptions", "Enable/disable C++ exception support", True),
     BoolVariable("high_priority", "Generate a library containing only the high priority operators", False),
     PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept),
-    PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure: 
+    PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
     EXTERNAL_TESTS_DIR:
     └── tests
         ├── benchmark
@@ -240,7 +246,6 @@
 if not 'windows' in env['os']:
     env.Append(CXXFLAGS = ['-Wall','-std=c++14', '-pedantic' ])
 
-env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP'])
 
 cpp_tool = {'linux': 'g++', 'android' : 'clang++',
              'tizen': 'g++', 'macos':'clang++',
@@ -312,8 +317,7 @@
         Exit(1)
 
     if 'v8a' in env['arch']:
-        print("INFO: multi_isa armv8-a architecture build doesn't enable __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. Use armv8.2-a or beyond to enable FP16 vector arithmetic support")
-        env.Append(CXXFLAGS = ['-march=armv8-a']) # note: this will disable fp16 extension __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        env.Append(CXXFLAGS = ['-march=armv8-a'])
     else:
         if 'v8.6-a' in env['arch']:
             if "disable_mmla_fp" not in env['custom_options']:
@@ -536,7 +540,7 @@
     if not 'windows' in env['os']:
         env.Append(CXXFLAGS = ['-fPIC'])
         env.Append(LINKFLAGS = ['-static-libgcc','-static-libstdc++'])
-       
+
 if env['Werror']:
     env.Append(CXXFLAGS = ['-Werror'])
 
@@ -597,7 +601,7 @@
     else:
         env.Append(CXXFLAGS = ['-Z7','-MTd','-fms-compatibility','-fdelayed-template-parsing'])
         env.Append(LINKFLAGS = ['-DEBUG'])
- 
+
     env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED'])
 else:
     if not 'windows' in env['os']:
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 6d27ae3..13f4e9e 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -44,6 +44,8 @@
 v24.01 Public major release
  - Remove the legacy 'libarm_compute_core' library. This library is an artifact of Compute Library's legacy library architecture and no longer serves any purpose.
   You should link only to the main `libarm_compute` library for core functionality.
+ - New features
+   - Add support for FP16 in all multi_isa builds.
 
 v23.11 Public major release
  - New features
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 686304b..50b3fc1 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_CORE_COMMON_REGISTRARS_H
-#define SRC_CORE_COMMON_REGISTRARS_H
+#ifndef ACL_SRC_CORE_COMMON_REGISTRARS_H
+#define ACL_SRC_CORE_COMMON_REGISTRARS_H
 
 #if defined(ENABLE_FP16_KERNELS)
 
@@ -38,11 +38,11 @@
 #define REGISTER_FP16_SVE2(func_name) nullptr
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
 
-#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_NEON)
 #define REGISTER_FP16_NEON(func_name) &(func_name)
 #else /* !defined(ARM_COMPUTE_ENABLE_NEON) */
 #define REGISTER_FP16_NEON(func_name) nullptr
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 
 #else /* !defined(ENABLE_FP16_KERNELS) */
 #define REGISTER_FP16_NEON(func_name) nullptr
@@ -179,4 +179,4 @@
 #define REGISTER_BF16_NEON(func_name) nullptr
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/
 
-#endif /* SRC_CORE_COMMON_REGISTRARS_H */
+#endif // ACL_SRC_CORE_COMMON_REGISTRARS_H