Enable fat binary support

Changes our build system to allow building both Neon(TM) and SVE
kernels and packaging them in the same binary. This enables
runtime selection of the underlying architecture.

Adds a new build option, fat_binary, to enable this feature.

Change-Id: I8e8386149773ce28e071a2fb7ddd8c8ae0f28a4a
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5704
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
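For context, the dispatch model this change enables works roughly as
sketched below: both kernel flavours are linked into a single binary
(only the dedicated SVE objects are compiled with SVE flags), and the
dispatcher probes the CPU once at runtime before handing out an entry
point. The kernel names here are hypothetical placeholders rather than
the library's API; on Linux/aarch64 the SVE probe is the HWCAP_SVE bit
that CPUUtils.cpp already reads.

```cpp
// Minimal sketch of fat-binary runtime dispatch, assuming hypothetical
// kernel entry points; the real library routes this through CPUInfo
// and per-kernel selector tables.
#include <cstddef>
#include <sys/auxv.h> // getauxval, AT_HWCAP

#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22) // aarch64 value from <asm/hwcap.h>
#endif

// Placeholder bodies: in the library the SVE flavour lives in its own
// translation units, compiled with -march=armv8.2-a+sve+fp16 (see
// build_sve_objs in the SConscript change), while the rest of the
// binary keeps the baseline architecture flags.
static void add_fp32_neon(const float *a, const float *b, float *dst, size_t n)
{
    for(size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i]; // Neon path in reality
}
static void add_fp32_sve(const float *a, const float *b, float *dst, size_t n)
{
    for(size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i]; // SVE path in reality
}

using AddFn = void (*)(const float *, const float *, float *, size_t);

// Probe the CPU at runtime and return the best available kernel.
AddFn select_add_kernel()
{
    const bool has_sve = (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
    return has_sve ? add_fp32_sve : add_fp32_neon;
}
```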
diff --git a/Android.bp b/Android.bp
index d1003f2..cbb58b1 100644
--- a/Android.bp
+++ b/Android.bp
@@ -314,12 +314,15 @@
         "src/core/cpu/kernels/add/neon/qasymm8.cpp",
         "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp",
         "src/core/cpu/kernels/add/neon/qsymm16.cpp",
+        "src/core/cpu/kernels/add/sve/impl.cpp",
         "src/core/cpu/kernels/add/sve/integer.cpp",
         "src/core/cpu/kernels/add/sve/qasymm8.cpp",
         "src/core/cpu/kernels/add/sve/qasymm8_signed.cpp",
         "src/core/cpu/kernels/add/sve/qsymm16.cpp",
-        "src/core/cpu/kernels/floor/NEON/fp16.cpp",
-        "src/core/cpu/kernels/floor/NEON/fp32.cpp",
+        "src/core/cpu/kernels/elementwise/sve/elementwise.cpp",
+        "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp",
+        "src/core/cpu/kernels/floor/neon/fp16.cpp",
+        "src/core/cpu/kernels/floor/neon/fp32.cpp",
         "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
         "src/core/cpu/kernels/pooling/neon/fp16.cpp",
         "src/core/cpu/kernels/pooling/neon/fp32.cpp",
@@ -335,6 +338,7 @@
         "src/core/cpu/kernels/scale/sve/integer.cpp",
         "src/core/cpu/kernels/scale/sve/qasymm8.cpp",
         "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp",
+        "src/core/cpu/kernels/softmax/impl/sve/impl.cpp",
         "src/core/cpu/kernels/sub/neon/integer.cpp",
         "src/core/cpu/kernels/sub/neon/qasymm8.cpp",
         "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp",
diff --git a/SConscript b/SConscript
index 143823d..94ba6d4 100644
--- a/SConscript
+++ b/SConscript
@@ -26,6 +26,7 @@
 import zlib
 import base64
 import string
+import json
 
 VERSION = "v0.0-unreleased"
 LIBRARY_VERSION_MAJOR = 23
@@ -38,13 +39,20 @@
 Import('install_lib')
 
 def build_bootcode_objs(sources):
-
     arm_compute_env.Append(ASFLAGS = "-I bootcode/")
     obj = arm_compute_env.Object(sources)
     obj = install_lib(obj)
     Default(obj)
     return obj
 
+def build_sve_objs(sources):
+    tmp_env = arm_compute_env.Clone()
+    tmp_env.Append(CXXFLAGS = "-march=armv8.2-a+sve+fp16")
+    obj = tmp_env.SharedObject(sources)
+    obj = install_lib(obj)
+    Default(obj)
+    return obj
+
 def build_library(name, build_env, sources, static=False, libs=[]):
     if static:
         obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
@@ -172,6 +180,9 @@
 
 arm_compute_env.Append(LIBS = ['dl'])
 
+with open(Dir('#').path + '/filelist.json') as fp:
+    filelist = json.load(fp)
+
 core_files = Glob('src/core/*.cpp')
 core_files += Glob('src/core/CPP/*.cpp')
 core_files += Glob('src/core/CPP/kernels/*.cpp')
@@ -189,25 +200,14 @@
 runtime_files += Glob('src/runtime/CPP/functions/*.cpp')
 
 # C API files
-c_api_files = ['src/c/AclContext.cpp',
-               'src/c/AclQueue.cpp',
-               'src/c/AclTensor.cpp',
-               'src/c/AclTensorPack.cpp',
-               'src/c/AclVersion.cpp',
-               ]
+runtime_files += filelist['c_api']['cpu']
+
 if env['opencl']:
-    c_api_files += ['src/c/cl/AclOpenClExt.cpp']
+    runtime_files += filelist['c_api']['gpu']
 
 # Common backend files
-common_backend_files = ['src/common/utils/LegacySupport.cpp',
-                        'src/common/AllocatorWrapper.cpp',
-                        'src/common/ITensorV2.cpp',
-                        'src/common/TensorPack.cpp',
-                        ]
+core_files += filelist['common']
 
-core_files += common_backend_files
-runtime_files += c_api_files
-# CLHarrisCorners uses the Scheduler to run CPP kernels
 runtime_files += Glob('src/runtime/CPP/SingleThreadScheduler.cpp')
 
 graph_files = Glob('src/graph/*.cpp')
@@ -231,9 +231,7 @@
                           ]
     core_files += cl_kernel_hp_files
     core_files += Glob('src/core/CL/*.cpp')
-    core_files += Glob('src/core/CL/kernels/*.cpp')
     core_files += Glob('src/core/gpu/cl/*.cpp')
-    core_files += Glob('src/core/gpu/cl/kernels/*.cpp')
 
     runtime_files += Glob('src/runtime/CL/*.cpp')
     runtime_files += Glob('src/runtime/CL/functions/*.cpp')
@@ -245,10 +243,12 @@
     runtime_files += Glob('src/runtime/CL/gemm_auto_heuristics/*.cpp')
 
     runtime_files += Glob('src/gpu/cl/*.cpp')
-
     graph_files += Glob('src/graph/backends/CL/*.cpp')
 
+    core_files += filelist['gpu']['core']['kernels']['high_priority'] + filelist['gpu']['core']['kernels']['all']
 
+sve_o = []
+core_files_sve = []
 if env['neon']:
     core_files += Glob('src/core/NEON/*.cpp')
     core_files += Glob('src/core/NEON/kernels/*.cpp')
@@ -277,107 +277,47 @@
     if env['estate'] == '64':
         core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
         core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/a64_*/*.cpp')
-        if "sve" in env['arch']:
-             core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
-             core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp')
+        if "sve" in env['arch'] or env['fat_binary']:
+            core_files_sve += filelist['cpu']['core']['sve']['all']
+            core_files_sve += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
+            core_files_sve += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp')
+
+    if any(i in env['data_layout_support'] for i in ['all', 'nchw']):
+        core_files += filelist['cpu']['core']['neon']['nchw']
 
     if any(i in env['data_type_support'] for i in ['all', 'fp16']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/fp16.cpp')
+        if not "sve" in env['arch'] or env['fat_binary']:
+            core_files += filelist['cpu']['core']['neon']['fp16']
+        if "sve" in env['arch'] or env['fat_binary']:
+            core_files_sve += filelist['cpu']['core']['sve']['fp16']
     if any(i in env['data_type_support'] for i in ['all', 'fp32']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/fp32.cpp')
+        if not "sve" in env['arch'] or env['fat_binary']:
+            core_files += filelist['cpu']['core']['neon']['fp32']
+        if "sve" in env['arch'] or env['fat_binary']:
+            core_files_sve += filelist['cpu']['core']['sve']['fp32']
     if any(i in env['data_type_support'] for i in ['all', 'qasymm8']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8.cpp')
+        core_files += filelist['cpu']['core']['neon']['qasymm8']
+        core_files_sve += filelist['cpu']['core']['sve']['qasymm8']
     if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8_signed.cpp')
+        core_files += filelist['cpu']['core']['neon']['qasymm8_signed']
+        core_files_sve += filelist['cpu']['core']['sve']['qasymm8_signed']
     if any(i in env['data_type_support'] for i in ['all', 'qsymm16']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/qsymm16.cpp')
+        core_files += filelist['cpu']['core']['neon']['qsymm16']
+        core_files_sve += filelist['cpu']['core']['sve']['qsymm16']
     if any(i in env['data_type_support'] for i in ['all', 'integer']):
-        core_files += Glob('src/core/NEON/kernels/*/impl/*/integer.cpp')
+        if not "sve" in env['arch'] or env['fat_binary']:
+            core_files += filelist['cpu']['core']['neon']['integer']
+        if "sve" in env['arch'] or env['fat_binary']:
+            core_files_sve += filelist['cpu']['core']['sve']['integer']
+
+    core_files += Glob('src/core/cpu/kernels/*/*.cpp')
+    core_files += filelist['cpu']['core']['kernels']['high_priority'] + filelist['cpu']['core']['kernels']['all']
 
     runtime_files += Glob('src/runtime/NEON/*.cpp')
     runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
     runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp')
-
-    cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp',
-                           'src/core/cpu/kernels/CpuCastKernel.cpp',
-                           'src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp',
-                           'src/core/cpu/kernels/CpuDirectConv2dKernel.cpp',
-                           'src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp',
-                           'src/core/cpu/kernels/CpuPermuteKernel.cpp',
-                           'src/core/cpu/kernels/CpuPool2dKernel.cpp',
-                           'src/core/cpu/kernels/CpuReshapeKernel.cpp',
-                          ]
-    cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp',
-                        'src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp',
-                        'src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp',
-                        'src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp',
-                        'src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp',
-                        'src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp',
-                        'src/core/cpu/kernels/CpuCopyKernel.cpp',
-                        'src/core/cpu/kernels/CpuDequantizeKernel.cpp',
-                        'src/core/cpu/kernels/CpuElementwiseKernel.cpp',
-                        'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp',
-                        'src/core/cpu/kernels/CpuFillKernel.cpp',
-                        'src/core/cpu/kernels/CpuFloorKernel.cpp',
-                        'src/core/cpu/kernels/CpuMulKernel.cpp',
-                        'src/core/cpu/kernels/CpuQuantizeKernel.cpp',
-                        'src/core/cpu/kernels/CpuScaleKernel.cpp',
-                        'src/core/cpu/kernels/CpuSoftmaxKernel.cpp',
-                        'src/core/cpu/kernels/CpuSubKernel.cpp',
-                        'src/core/cpu/kernels/CpuTransposeKernel.cpp',
-                       ]
-    core_files += [cpu_kernel_hp_files, cpu_kernel_files]
-
-    core_files += Glob('src/core/cpu/kernels/*/*.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'fp16']):
-        core_files += Glob('src/core/cpu/kernels/*/*/fp16.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'fp32']):
-        core_files += Glob('src/core/cpu/kernels/*/*/fp32.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'qasymm8']):
-        core_files += Glob('src/core/cpu/kernels/*/*/qasymm8.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']):
-        core_files += Glob('src/core/cpu/kernels/*/*/qasymm8_signed.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'qsymm16']):
-        core_files += Glob('src/core/cpu/kernels/*/*/qsymm16.cpp')
-    if any(i in env['data_type_support'] for i in ['all', 'integer']):
-        core_files += Glob('src/core/cpu/kernels/*/*/integer.cpp')
-
-    if any(i in env['data_layout_support'] for i in ['all', 'nchw']):
-        core_files += Glob('src/core/cpu/kernels/*/*/nchw/all.cpp')
-
-    cpu_rt_files = ['src/cpu/CpuContext.cpp',
-                    'src/cpu/CpuQueue.cpp',
-                    'src/cpu/CpuTensor.cpp'
-                   ]
-    cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp',
-                             'src/runtime/cpu/operators/CpuCast.cpp',
-                             'src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp',
-                             'src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp',
-                             'src/runtime/cpu/operators/CpuDirectConv2d.cpp',
-                             'src/runtime/cpu/operators/CpuFlatten.cpp',
-                             'src/runtime/cpu/operators/CpuPermute.cpp',
-                             'src/runtime/cpu/operators/CpuPool2d.cpp',
-                            ]
-    cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp',
-                          'src/runtime/cpu/operators/CpuConcatenate.cpp',
-                          'src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp',
-                          'src/runtime/cpu/operators/CpuCopy.cpp',
-                          'src/runtime/cpu/operators/CpuDequantize.cpp',
-                          'src/runtime/cpu/operators/CpuElementwise.cpp',
-                          'src/runtime/cpu/operators/CpuElementwiseUnary.cpp',
-                          'src/runtime/cpu/operators/CpuFill.cpp',
-                          'src/runtime/cpu/operators/CpuFloor.cpp',
-                          'src/runtime/cpu/operators/CpuMul.cpp',
-                          'src/runtime/cpu/operators/CpuQuantize.cpp',
-                          'src/runtime/cpu/operators/CpuReshape.cpp',
-                          'src/runtime/cpu/operators/CpuScale.cpp',
-                          'src/runtime/cpu/operators/CpuSoftmax.cpp',
-                          'src/runtime/cpu/operators/CpuSub.cpp',
-                          'src/runtime/cpu/operators/CpuTranspose.cpp',
-                          'src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp',
-                         ]
-    cpu_internal_operator_files = ['src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp',]
-    runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files, cpu_internal_operator_files ]
+    runtime_files += filelist['cpu']['runtime']['all'] + filelist['cpu']['runtime']['operators']['high_priority'] \
+                  + filelist['cpu']['runtime']['operators']['all'] + filelist['cpu']['runtime']['operators']['internal']
 
 bootcode_o = []
 if env['os'] == 'bare_metal':
@@ -385,11 +325,18 @@
     bootcode_o = build_bootcode_objs(bootcode_files)
 Export('bootcode_o')
 
-arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files, static=True)
+if (env['fat_binary']):
+    sve_o = build_sve_objs(core_files_sve)
+    arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + sve_o, static=True)
+else:
+    arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + core_files_sve, static=True)
 Export('arm_compute_core_a')
 
 if env['os'] != 'bare_metal' and not env['standalone']:
-    arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files, static=False)
+    if (env['fat_binary']):
+        arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + sve_o, static=False)
+    else:
+        arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + core_files_sve, static=False)
     Export('arm_compute_core_so')
 
 arm_compute_a = build_library('arm_compute-static', arm_compute_env, runtime_files, static=True, libs = [ arm_compute_core_a ])
diff --git a/SConstruct b/SConstruct
index d36fbab..f800d9d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -53,6 +53,7 @@
     BoolVariable("examples", "Build example programs", True),
     BoolVariable("gemm_tuner", "Build gemm_tuner programs", True),
     BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
+    BoolVariable("fat_binary", "Build fat binary version of library. Note works only for armv8.2-a", False),
     BoolVariable("standalone", "Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute", False),
     BoolVariable("opencl", "Enable OpenCL support", True),
     BoolVariable("neon", "Enable Arm® Neon™ support", False),
@@ -255,6 +256,11 @@
         elif env['os'] == 'tizen':
             prefix = "aarch64-tizen-linux-gnu-"
 
+if 'sve' in env['arch']:
+    env.Append(CXXFLAGS = ['-DENABLE_SVE'])
+else:
+    env.Append(CXXFLAGS = ['-DENABLE_NEON'])
+
 if env['build'] == 'native':
     prefix = ""
 
@@ -298,6 +304,13 @@
         if not version_at_least(compiler_ver, '7.0.0') and env['os'] == 'bare_metal':
             env.Append(LINKFLAGS = ['-fstack-protector-strong'])
 
+if env['fat_binary']:
+    if env['arch'] != 'armv8.2-a':
+        print("Currently fat binary is only supported with armv8.2-a")
+        Exit(1)
+    env.Append(CXXFLAGS = ['-DENABLE_SVE'])
+    env.Append(CXXFLAGS = ['-DENABLE_NEON'])
+
 if env['data_type_support']:
     if any(i in env['data_type_support'] for i in ['all', 'fp16']):
         env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
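With these defines, SVE code is guarded by a build-system switch rather
than the compiler-derived __ARM_FEATURE_SVE, so a fat binary can define
both ENABLE_SVE and ENABLE_NEON while only the dedicated SVE object set
is actually compiled with +sve. A minimal sketch of such a guarded
translation unit, with an illustrative function name (the
batchnormalization fp16/fp32 files further down follow this shape):

```cpp
// Shape of an SVE kernel source file under the new scheme: the file is
// part of every fat-binary build, but its body only exists when the
// build system opts in via -DENABLE_SVE. Names are illustrative.
#if defined(ENABLE_SVE)
#include <arm_sve.h>

namespace arm_compute
{
namespace cpu
{
void fp32_sve_copy_example(const float *src, float *dst, int n)
{
    // SVE intrinsics are only legal here because this file belongs to
    // the object set built with -march=armv8.2-a+sve+fp16.
    for(int i = 0; i < n; i += static_cast<int>(svcntw()))
    {
        svbool_t    pg = svwhilelt_b32(i, n);
        svfloat32_t v  = svld1_f32(pg, src + i);
        svst1_f32(pg, dst + i, v);
    }
}
} // namespace cpu
} // namespace arm_compute
#endif // defined(ENABLE_SVE)
```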
diff --git a/filelist.json b/filelist.json
new file mode 100644
index 0000000..d84a350
--- /dev/null
+++ b/filelist.json
@@ -0,0 +1,288 @@
+{
+    "common" : [
+        "src/common/utils/LegacySupport.cpp",
+        "src/common/AllocatorWrapper.cpp",
+        "src/common/ITensorV2.cpp",
+        "src/common/TensorPack.cpp"
+    ],
+    "c_api" :
+    {
+        "cpu":    [
+            "src/c/AclContext.cpp",
+            "src/c/AclQueue.cpp",
+            "src/c/AclTensor.cpp",
+            "src/c/AclTensorPack.cpp",
+            "src/c/AclVersion.cpp"
+        ],
+        "gpu": [
+            "src/c/cl/AclOpenClExt.cpp"
+        ]
+    },
+
+    "gpu" :
+    {
+        "core" :
+        {
+            "kernels" :
+            {
+                "high_priority" : [
+                    "src/core/gpu/cl/kernels/ClActivationKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClPermuteKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClPool2dKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClReshapeKernel.cpp"
+                ],
+                "all" : [
+                    "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClCastKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClCopyKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClCropKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClDequantizeKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClElementwiseKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClFillKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClFloorKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClMulKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClQuantizeKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClScaleKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClTransposeKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
+                    "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
+                    "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp",
+                    "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp",
+                    "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp",
+                    "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp",
+                    "src/core/CL/kernels/CLBitwiseKernel.cpp",
+                    "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp",
+                    "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp",
+                    "src/core/CL/kernels/CLCol2ImKernel.cpp",
+                    "src/core/CL/kernels/CLComparisonKernel.cpp",
+                    "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp",
+                    "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp",
+                    "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp",
+                    "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp",
+                    "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp",
+                    "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp",
+                    "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp",
+                    "src/core/CL/kernels/CLFFTRadixStageKernel.cpp",
+                    "src/core/CL/kernels/CLFFTScaleKernel.cpp",
+                    "src/core/CL/kernels/CLFillBorderKernel.cpp",
+                    "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
+                    "src/core/CL/kernels/CLGatherKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp",
+                    "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp",
+                    "src/core/CL/kernels/CLIm2ColKernel.cpp",
+                    "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp",
+                    "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp",
+                    "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
+                    "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
+                    "src/core/CL/kernels/CLMinMaxLayerKernel.cpp",
+                    "src/core/CL/kernels/CLNormalizationLayerKernel.cpp",
+                    "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp",
+                    "src/core/CL/kernels/CLPadLayerKernel.cpp",
+                    "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp",
+                    "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp",
+                    "src/core/CL/kernels/CLRangeKernel.cpp",
+                    "src/core/CL/kernels/CLReductionOperationKernel.cpp",
+                    "src/core/CL/kernels/CLRemapKernel.cpp",
+                    "src/core/CL/kernels/CLReorgLayerKernel.cpp",
+                    "src/core/CL/kernels/CLReverseKernel.cpp",
+                    "src/core/CL/kernels/CLROIAlignLayerKernel.cpp",
+                    "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp",
+                    "src/core/CL/kernels/CLSelectKernel.cpp",
+                    "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp",
+                    "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp",
+                    "src/core/CL/kernels/CLStackLayerKernel.cpp",
+                    "src/core/CL/kernels/CLStridedSliceKernel.cpp",
+                    "src/core/CL/kernels/CLTileKernel.cpp",
+                    "src/core/CL/kernels/CLWeightsReshapeKernel.cpp",
+                    "src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp",
+                    "src/core/CL/kernels/CLWinogradInputTransformKernel.cpp",
+                    "src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp"
+                ]
+            }
+        }
+    },
+    "cpu" :
+    {
+        "runtime" :
+        {
+            "all" : [
+                "src/cpu/CpuContext.cpp",
+                "src/cpu/CpuQueue.cpp",
+                "src/cpu/CpuTensor.cpp"
+            ],
+            "operators" :
+            {
+                "high_priority" : [
+                    "src/runtime/cpu/operators/CpuActivation.cpp",
+                    "src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp",
+                    "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp",
+                    "src/runtime/cpu/operators/CpuDirectConv2d.cpp",
+                    "src/runtime/cpu/operators/CpuPermute.cpp",
+                    "src/runtime/cpu/operators/CpuPool2d.cpp"
+                ],
+                "internal" : [
+                    "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp"
+                ],
+                "all" : [
+                    "src/runtime/cpu/operators/CpuAdd.cpp",
+                    "src/runtime/cpu/operators/CpuCast.cpp",
+                    "src/runtime/cpu/operators/CpuConcatenate.cpp",
+                    "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp",
+                    "src/runtime/cpu/operators/CpuCopy.cpp",
+                    "src/runtime/cpu/operators/CpuDequantize.cpp",
+                    "src/runtime/cpu/operators/CpuElementwise.cpp",
+                    "src/runtime/cpu/operators/CpuElementwiseUnary.cpp",
+                    "src/runtime/cpu/operators/CpuFill.cpp",
+                    "src/runtime/cpu/operators/CpuFlatten.cpp",
+                    "src/runtime/cpu/operators/CpuFloor.cpp",
+                    "src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp",
+                    "src/runtime/cpu/operators/CpuMul.cpp",
+                    "src/runtime/cpu/operators/CpuQuantize.cpp",
+                    "src/runtime/cpu/operators/CpuReshape.cpp",
+                    "src/runtime/cpu/operators/CpuScale.cpp",
+                    "src/runtime/cpu/operators/CpuSoftmax.cpp",
+                    "src/runtime/cpu/operators/CpuSub.cpp",
+                    "src/runtime/cpu/operators/CpuTranspose.cpp"
+                ]
+            }
+        },
+        "core" :
+        {
+            "kernels" :
+            {
+                "high_priority" : [
+                    "src/core/cpu/kernels/CpuActivationKernel.cpp",
+                    "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
+                    "src/core/cpu/kernels/CpuDirectConv2dKernel.cpp",
+                    "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp",
+                    "src/core/cpu/kernels/CpuPermuteKernel.cpp",
+                    "src/core/cpu/kernels/CpuPool2dKernel.cpp",
+                    "src/core/cpu/kernels/CpuReshapeKernel.cpp"
+                ],
+                "all" : [
+                    "src/core/cpu/kernels/CpuAddKernel.cpp",
+                    "src/core/cpu/kernels/CpuCastKernel.cpp",
+                    "src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp",
+                    "src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp",
+                    "src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp",
+                    "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp",
+                    "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
+                    "src/core/cpu/kernels/CpuCopyKernel.cpp",
+                    "src/core/cpu/kernels/CpuDequantizeKernel.cpp",
+                    "src/core/cpu/kernels/CpuElementwiseKernel.cpp",
+                    "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
+                    "src/core/cpu/kernels/CpuFillKernel.cpp",
+                    "src/core/cpu/kernels/CpuFloorKernel.cpp",
+                    "src/core/cpu/kernels/CpuMulKernel.cpp",
+                    "src/core/cpu/kernels/CpuQuantizeKernel.cpp",
+                    "src/core/cpu/kernels/CpuScaleKernel.cpp",
+                    "src/core/cpu/kernels/CpuSoftmaxKernel.cpp",
+                    "src/core/cpu/kernels/CpuSubKernel.cpp",
+                    "src/core/cpu/kernels/CpuTransposeKernel.cpp"
+                ]
+            },
+
+            "sve" :
+            {
+                "all" : [
+                    "src/core/cpu/kernels/add/sve/impl.cpp",
+                    "src/core/cpu/kernels/softmax/impl/sve/impl.cpp",
+                    "src/core/cpu/kernels/elementwise/sve/elementwise.cpp",
+                    "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp"
+                ],
+                "fp32" : [
+                    "src/core/cpu/kernels/activation/sve/fp32.cpp",
+                    "src/core/cpu/kernels/scale/sve/fp32.cpp",
+                    "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp"
+                ],
+                "fp16" : [
+                    "src/core/cpu/kernels/activation/sve/fp16.cpp",
+                    "src/core/cpu/kernels/scale/sve/fp16.cpp",
+                    "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp"
+                ],
+                "qsymm16" : [
+                    "src/core/cpu/kernels/activation/sve/qsymm16.cpp",
+                    "src/core/cpu/kernels/add/sve/qsymm16.cpp"
+                ],
+                "qasymm8" : [
+                    "src/core/cpu/kernels/activation/sve/qasymm8.cpp",
+                    "src/core/cpu/kernels/add/sve/qasymm8.cpp",
+                    "src/core/cpu/kernels/scale/sve/qasymm8.cpp"
+                ],
+                "qasymm8_signed" : [
+                    "src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/add/sve/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp"
+                ],
+                "integer" : [
+                    "src/core/cpu/kernels/add/sve/integer.cpp",
+                    "src/core/cpu/kernels/scale/sve/integer.cpp"
+                ]
+            },
+
+            "neon":
+            {
+                "nchw" : [
+                    "src/core/cpu/kernels/pooling/neon/nchw/all.cpp"
+                ],
+                "fp32" : [
+                    "src/core/cpu/kernels/activation/neon/fp32.cpp",
+                    "src/core/cpu/kernels/floor/neon/fp32.cpp",
+                    "src/core/cpu/kernels/pooling/neon/fp32.cpp",
+                    "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp"
+                ],
+                "fp16" : [
+                    "src/core/cpu/kernels/activation/neon/fp16.cpp",
+                    "src/core/cpu/kernels/floor/neon/fp16.cpp",
+                    "src/core/cpu/kernels/pooling/neon/fp16.cpp",
+                    "src/core/cpu/kernels/scale/neon/fp16.cpp",
+                    "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp"
+                ],
+                "qsymm16" : [
+                    "src/core/cpu/kernels/activation/neon/qsymm16.cpp",
+                    "src/core/cpu/kernels/add/neon/qsymm16.cpp",
+                    "src/core/cpu/kernels/sub/neon/qsymm16.cpp"
+
+                ],
+                "qasymm8" : [
+                    "src/core/cpu/kernels/activation/neon/qasymm8.cpp",
+                    "src/core/cpu/kernels/add/neon/qasymm8.cpp",
+                    "src/core/cpu/kernels/pooling/neon/qasymm8.cpp",
+                    "src/core/cpu/kernels/scale/neon/qasymm8.cpp",
+                    "src/core/cpu/kernels/sub/neon/qasymm8.cpp"
+                ],
+                "qasymm8_signed" : [
+                    "src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp",
+                    "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp"
+                ],
+                "integer" : [
+                    "src/core/cpu/kernels/sub/neon/integer.cpp",
+                    "src/core/cpu/kernels/add/neon/integer.cpp"
+                ]
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 649d934..8ab7c13 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -63,6 +63,7 @@
                 ("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or
                 ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or
                 ("CPUUtils.cpp" in line and "use of undeclared identifier 'HWCAP_SVE'" in line) or
+               ("sve" in line) or
                 ("'arm_compute_version.embed' file not found" in line) ):
                 print_context=False
                 continue
@@ -115,6 +116,7 @@
                ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or
                ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or
                ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or
+               ("sve" in line) or
                ("CpuDepthwiseConv2dNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy
                 print_context=False
                 continue
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index b73043a..dde75e8 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_SVEMATH_H
 #define ARM_COMPUTE_SVEMATH_H
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "src/core/NEON/wrapper/intrinsics/svcvt.h"
 #include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
 #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
@@ -185,5 +185,5 @@
 
 } // namespace arm_compute
 #include "src/core/NEON/SVEMath.inl"
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 #endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index d909adf..7625e5b 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -24,7 +24,7 @@
 #include <cmath>
 #include <limits>
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && defined(ENABLE_SVE)
 
 #ifndef M_PI
 #define M_PI (3.14159265358979323846)
@@ -388,4 +388,4 @@
 #endif /* defined(__ARM_FEATURE_SVE2) */
 
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 1691943..92000bb 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -63,7 +63,7 @@
 
 static const BatchNormalizationKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_batch_normalization",
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
@@ -74,7 +74,8 @@
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "fp16_neon_batch_normalization",
@@ -87,7 +88,7 @@
         [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
     },
-#endif /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 };
 
 const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 3e3e81d..a715b9d 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -29,7 +29,7 @@
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -114,4 +114,4 @@
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
+#endif // ENABLE_SVE
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index b0d4cbb..7cc570d 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -29,7 +29,7 @@
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -114,4 +114,4 @@
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
+#endif // ENABLE_SVE
diff --git a/src/core/NEON/wrapper/intrinsics/svpow.h b/src/core/NEON/wrapper/intrinsics/svpow.h
index e89a4ab..0f58d75 100644
--- a/src/core/NEON/wrapper/intrinsics/svpow.h
+++ b/src/core/NEON/wrapper/intrinsics/svpow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,16 @@
         return svpow_##postfix##_z(pg, a, b);                      \
     }
 
+#define SVPOW_Z_IMPL_INT(type, postfix)                            \
+    inline type svpow_z(svbool_t pg, const type &a, const type &b) \
+    {                                                              \
+        ARM_COMPUTE_UNUSED(pg, a, b);                              \
+        ARM_COMPUTE_ERROR("Not supported");                        \
+    }
+
 SVPOW_Z_IMPL(svfloat32_t, f32)
 SVPOW_Z_IMPL(svfloat16_t, f16)
+SVPOW_Z_IMPL_INT(svint16_t, s16)
 
 #undef SVPOW_Z_IMPL
 
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 465983d..8d2d660 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -23,7 +23,7 @@
  */
 #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H
 #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "src/core/NEON/SVEMath.h"
 #include <arm_sve.h>
 
@@ -66,5 +66,5 @@
 } // namespace wrapper
 } // namespace arm_compute
 
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 #endif /* #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H */
diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h
index 3452b76..8168514 100644
--- a/src/core/NEON/wrapper/traits.h
+++ b/src/core/NEON/wrapper/traits.h
@@ -26,9 +26,9 @@
 
 #include <arm_neon.h>
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include <arm_sve.h>
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 
 namespace arm_compute
 {
@@ -116,13 +116,13 @@
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 /** Create the appropriate SVE vector given its type */
 template <typename T> struct sve_vector;
 
 template <> struct sve_vector<uint8_t>{ using scalar_type = uint8_t; using type = svuint8_t; };
 template <> struct sve_vector<int8_t>{ using scalar_type = int8_t; using type = svint8_t; };
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 
 #endif /* DOXYGEN_SKIP_THIS */
 
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index 112c83a..44ddf98 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -26,17 +26,17 @@
 
 #if defined(ENABLE_FP16_KERNELS)
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #define REGISTER_FP16_SVE(func_name) &(func_name)
-#else /* !defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
 #define REGISTER_FP16_SVE(func_name) nullptr
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #define REGISTER_FP16_NEON(func_name) &(func_name)
-#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#else /* !defined(ENABLE_NEON) */
 #define REGISTER_FP16_NEON(func_name) nullptr
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
 
 #else /* !defined(ENABLE_FP16_KERNELS) */
 #define REGISTER_FP16_NEON(func_name) nullptr
@@ -44,50 +44,82 @@
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
 #if defined(ENABLE_FP32_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
+
+#if defined(ENABLE_SVE)
 #define REGISTER_FP32_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_FP32_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
 #define REGISTER_FP32_NEON(func_name) &(func_name)
+#else /* !defined(ENABLE_NEON) */
+#define REGISTER_FP32_NEON(func_name) nullptr
+#endif /* defined(ENABLE_NEON) */
+
 #else /* defined(ENABLE_FP32_KERNELS) */
 #define REGISTER_FP32_NEON(func_name) nullptr
 #define REGISTER_FP32_SVE(func_name) nullptr
 #endif /* defined(ENABLE_FP32_KERNELS) */
 
 #if defined(ENABLE_QASYMM8_SIGNED_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+
 #define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
 #define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 
 #if defined(ENABLE_QASYMM8_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
 #define REGISTER_QASYMM8_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QASYMM8_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QASYMM8_KERNELS) */
 #define REGISTER_QASYMM8_NEON(func_name) nullptr
 #define REGISTER_QASYMM8_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_KERNELS) */
 
 #if defined(ENABLE_QSYMM16_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
-#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+
 #define REGISTER_QSYMM16_NEON(func_name) &(func_name)
+
+#if defined(ENABLE_SVE)
+#define REGISTER_QSYMM16_SVE(func_name) &(func_name)
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
 #else /* defined(ENABLE_QSYMM16_KERNELS) */
 #define REGISTER_QSYMM16_NEON(func_name) nullptr
 #define REGISTER_QSYMM16_SVE(func_name) nullptr
 #endif /* defined(ENABLE_QSYMM16_KERNELS) */
 
 #if defined(ENABLE_INTEGER_KERNELS)
-#if defined(__ARM_FEATURE_SVE)
+
+#if defined(ENABLE_SVE)
 #define REGISTER_INTEGER_SVE(func_name) &(func_name)
-#endif /* defined(__ARM_FEATURE_SVE) */
+#else /* !defined(ENABLE_SVE) */
+#define REGISTER_INTEGER_SVE(func_name) nullptr
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
 #define REGISTER_INTEGER_NEON(func_name) &(func_name)
+#else /* !defined(ENABLE_NEON) */
+#define REGISTER_INTEGER_NEON(func_name) nullptr
+#endif /* defined(ENABLE_NEON) */
+
 #else /* defined(ENABLE_INTEGER_KERNELS) */
 #define REGISTER_INTEGER_NEON(func_name) nullptr
 #define REGISTER_INTEGER_SVE(func_name) nullptr
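The net effect of the reworked registrars: a kernel table can list both
backends, and an entry whose backend was compiled out collapses to
nullptr, which the selector skips. A condensed, self-contained sketch of
that pattern with hypothetical kernel names (the real tables in
CpuActivationKernel.cpp and friends below key on DataType and richer
selector data):

```cpp
// Condensed sketch of the Registrars.h pattern: each REGISTER_* macro
// expands to a function pointer when its backend is enabled and to
// nullptr otherwise, so tables can reference both flavours and dead
// entries are skipped at selection time. Kernel names are hypothetical.
#include <cstddef>

using KernelFn = void (*)(const float *, float *, size_t);

void fp32_neon_example(const float *, float *, size_t) {}
void fp32_sve_example(const float *, float *, size_t) {}

#if defined(ENABLE_SVE)
#define REGISTER_FP32_SVE(f) &(f)
#else
#define REGISTER_FP32_SVE(f) nullptr
#endif

#if defined(ENABLE_NEON)
#define REGISTER_FP32_NEON(f) &(f)
#else
#define REGISTER_FP32_NEON(f) nullptr
#endif

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(bool cpu_has_sve);
    KernelFn    ukernel; // nullptr when the backend is compiled out
};

static const KernelEntry available_kernels[] =
{
    { "fp32_sve_example", [](bool has_sve) { return has_sve; }, REGISTER_FP32_SVE(fp32_sve_example) },
    { "fp32_neon_example", [](bool) { return true; }, REGISTER_FP32_NEON(fp32_neon_example) },
};

// First entry that matches the runtime CPU and was actually compiled in.
KernelFn get_implementation(bool cpu_has_sve)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.ukernel != nullptr && uk.is_selected(cpu_has_sve))
        {
            return uk.ukernel;
        }
    }
    return nullptr;
}
```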
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
index eb38c18..8a57a3b 100644
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/core/cpu/kernels/CpuActivationKernel.cpp
@@ -60,7 +60,7 @@
 
 static const ActivationKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
@@ -71,7 +71,8 @@
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
     },
-#else  /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE)  */
+#if defined(ENABLE_NEON)
     {
         "fp16_neon_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
@@ -82,9 +83,8 @@
         [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
     },
-#endif /* defined(__ARM_FEATURE_SVE)  */
-
-#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(ENABLE_NEON)  */
+#if defined(__ARM_FEATURE_SVE2)
     {
         "qasymm8_sve_activation",
         [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
@@ -116,7 +116,7 @@
         [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
     },
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(__ARM_FEATURE_SVE2)  */
 };
 
 const ActivationKernel *get_implementation(const ActivationSelectorData &data)
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
index fc88a7e..7afdcea 100644
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ b/src/core/cpu/kernels/CpuAddKernel.cpp
@@ -61,7 +61,7 @@
 
 static const AddKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "add_same_sve",
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
@@ -102,7 +102,8 @@
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
     {
         "add_same_neon",
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
@@ -145,8 +146,7 @@
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
     },
-#endif /* defined(__ARM_FEATURE_SVE) */
-
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
     {
         "add_qasymm8_sve",
@@ -179,7 +179,7 @@
         [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
     },
-#endif /* defined(__ARM_FEATURE_SVE2) */
+#endif /* defined(__ARM_FEATURE_SVE2) */
 
 };
 
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
index ddbc48f..643a870 100644
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
@@ -76,28 +76,31 @@
     ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, float32_t>))),
-        generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, int32_t>))),
-#else  /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))),
+        generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))),
+        generate_kernel<DataType::S16>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
         generate_kernel<DataType::F32>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))),
         generate_kernel<DataType::S32>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
-        generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op<op, uint8_t>))),
-        generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op<op, int8_t>))),
-#else  /* defined(__ARM_FEATURE_SVE2) */
+        generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))),
+        generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))),
+#else  /* !defined(__ARM_FEATURE_SVE2) */
         generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))),
         generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))),
 #endif /* defined(__ARM_FEATURE_SVE2) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op<op, float16_t>))),
-#else  /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         generate_kernel<DataType::F16>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
         generate_kernel<DataType::S16>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))),
+#endif /* defined(ENABLE_NEON) */
     };
 
     for(const auto &uk : kernels)
@@ -118,31 +121,31 @@
     ARM_COMPUTE_UNUSED(src1, dst);
     static ElementwiseKernel kernels[] =
     {
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, uint8_t>))),
-        generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, float>))),
-        generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, int16_t>))),
-        generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, int32_t>))),
-#else  /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))),
+        generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))),
+        generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))),
+        generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))),
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
         generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))),
         generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))),
         generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))),
         generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 #if defined(__ARM_FEATURE_SVE2)
-        generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op<op, int8_t>))),
-        generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op<op, uint8_t>))),
-#else  /* defined(__ARM_FEATURE_SVE2) */
+        generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))),
+        generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))),
+#else  /* !defined(__ARM_FEATURE_SVE2) */
         generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))),
         generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))),
 #endif /* defined(__ARM_FEATURE_SVE2) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#if defined(__ARM_FEATURE_SVE)
-        generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_comparison_op<op, float16_t>))),
-#else  /* defined(__ARM_FEATURE_SVE) */
+#if defined(ENABLE_SVE)
+        generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))),
+#endif /* defined(ENABLE_SVE)  */
+#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))),
-#endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     };
 
     for(const auto &uk : kernels)
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 3a96d93..2600a49 100644
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -54,7 +54,7 @@
 
 static const ElementwiseUnaryKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp32_sve_elementwise_unary",
         [](DataType dt) { return dt == DataType::F32; },
@@ -70,7 +70,8 @@
         [](DataType dt) { return dt == DataType::S32; },
         REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
     },
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
+#if defined(ENABLE_NEON)
     {
         "fp32_neon_elementwise_unary",
         [](DataType dt) { return dt == DataType::F32; },
@@ -88,6 +89,7 @@
         [](DataType dt) { return dt == DataType::S32; },
         REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
     },
+#endif // defined(ENABLE_NEON)
 };
 
 const ElementwiseUnaryKernel *get_implementation(DataType dt)
diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp
index ed75171..29475fa 100644
--- a/src/core/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/core/cpu/kernels/CpuScaleKernel.cpp
@@ -64,38 +64,39 @@
 
 static const ScaleKernel available_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "fp16_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_sve_scale)
+        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
     },
     {
         "f32_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_sve_scale)
+        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
     },
     {
         "qasymm8_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_sve_scale)
+        REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
     },
     {
         "qasymm8_signed_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_sve_scale)
+        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
     },
     {
         "u8_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_sve_scale)
+        REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
     },
     {
         "s16_sve_scale",
         [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_sve_scale)
+        REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "common_neon_scale",
@@ -128,7 +129,7 @@
         [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
         REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
     },
-#endif /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 };
 
 /** Micro-kernel selector
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
index d2453ed..8ea186b 100644
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,8 +34,8 @@
 #include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/softmax/impl/NEON/list.h"
-#include "src/core/cpu/kernels/softmax/impl/SVE/list.h"
+#include "src/core/cpu/kernels/softmax/impl/neon/list.h"
+#include "src/core/cpu/kernels/softmax/impl/sve/list.h"
 
 namespace arm_compute
 {
@@ -69,7 +69,7 @@
 
 static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "sve_softmax_logits_1d_float",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -80,7 +80,9 @@
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+
+#if defined(ENABLE_NEON)
     {
         "neon_softmax_logits_1d_float",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -93,7 +95,7 @@
         REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 
 #if defined(__ARM_FEATURE_SVE2)
     {
@@ -123,7 +125,7 @@
 
 static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
 {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
     {
         "sve_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -144,7 +146,8 @@
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
     },
-#else /* !defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
+#if defined(ENABLE_NEON)
     {
         "neon_logits_1d_max",
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
@@ -167,7 +170,7 @@
         [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
     },
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_NEON) */
 };
 
 const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
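One detail worth spelling out: with both guard families present, an entry such as REGISTER_FP16_SVE(...) must still compile in a translation unit that is not built with SVE flags. A plausible reading of the Registrars.h pattern (the exact guard condition below is an assumption, not the verbatim header):

    // Assumed registrar shape: each REGISTER_* macro yields a function
    // pointer when the corresponding support is compiled in, and nullptr
    // otherwise, so the kernel tables always parse in every configuration.
    #if defined(ENABLE_SVE) && defined(ENABLE_FP16_KERNELS) // assumed condition
    #define REGISTER_FP16_SVE(func) &(func)
    #else
    #define REGISTER_FP16_SVE(func) nullptr
    #endif

Only the address of the kernel is taken at the table; the SVE body itself lives in an object compiled with the SVE march flags, which is the essence of the fat-binary split.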
diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp
index bf31fd7..e4be1a4 100644
--- a/src/core/cpu/kernels/activation/sve/fp16.cpp
+++ b/src/core/cpu/kernels/activation/sve/fp16.cpp
@@ -28,7 +28,6 @@
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
 #include <arm_sve.h>
 
@@ -126,5 +125,4 @@
     input, output);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp
index 75f9f8a..f797944 100644
--- a/src/core/cpu/kernels/activation/sve/fp32.cpp
+++ b/src/core/cpu/kernels/activation/sve/fp32.cpp
@@ -29,7 +29,6 @@
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -127,5 +126,4 @@
     input, output);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp
new file mode 100644
index 0000000..d1660fe
--- /dev/null
+++ b/src/core/cpu/kernels/add/sve/impl.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/cpu/kernels/add/sve/impl.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+{
+    const auto all_true_pg           = wrapper::svptrue<ScalarType>();
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
+    const bool is_sat                = (policy == ConvertPolicy::SATURATE);
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+
+    Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
+    Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
+    Iterator output(dst, window);
+
+    if(is_broadcast_across_x)
+    {
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(dst, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
+
+            const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+            const auto       broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+
+            int      x  = window_start_x;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+                auto       res             = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+                svst1(pg, output_ptr + x, res);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(src0, input1_win);
+        Iterator input2(src1, input2_win);
+        Iterator output(dst, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+            int      x  = window_start_x;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto val1 = svld1(pg, input1_ptr + x);
+                const auto val2 = svld1(pg, input2_ptr + x);
+                const auto res  = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+                svst1(pg, output_ptr + x, res);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        input1, input2, output);
+    }
+}
+
+template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
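The loop structure in the new impl.cpp is the standard SVE predicated pattern: svwhilelt builds a governing predicate for the lanes that remain, so there is no scalar tail loop and the code stays vector-length agnostic. Distilled to plain intrinsics for one type (a sketch assuming an SVE-enabled -march, not library code):

    #include <arm_sve.h>

    // Minimal predicated add: pg covers min(vector length, remaining
    // elements), so the final partial vector needs no tail handling.
    void add_f32(const float *a, const float *b, float *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            const svfloat32_t va = svld1(pg, a + x);
            const svfloat32_t vb = svld1(pg, b + x);
            svst1(pg, dst + x, svadd_z(pg, va, vb));

            x += static_cast<int>(svcntw()); // 32-bit lanes per vector
            pg = svwhilelt_b32(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
    }

add_same_sve above follows the same skeleton, with the saturating/non-saturating choice (wrapper::svqadd versus svadd_z) and the broadcast-across-x branch layered on top.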
diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h
new file mode 100644
index 0000000..c38b1d4
--- /dev/null
+++ b/src/core/cpu/kernels/add/sve/impl.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
+
+#if defined(ENABLE_SVE)
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <typename ScalarType>
+void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif // defined(ENABLE_SVE)
+#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
index ae74bfa..6dec140 100644
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ b/src/core/cpu/kernels/add/sve/integer.cpp
@@ -25,9 +25,8 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -197,5 +196,4 @@
     add_s16_u8_s16_sve(src1, src0, dst, policy, window);
 }
 } // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
index 71dd875..aebb43b 100644
--- a/src/core/cpu/kernels/add/sve/list.h
+++ b/src/core/cpu/kernels/add/sve/list.h
@@ -24,11 +24,12 @@
 #ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H
 #define SRC_CORE_SVE_KERNELS_ADD_LIST_H
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/cpu/kernels/add/sve/impl.h"
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -47,99 +48,7 @@
 
 #undef DECLARE_ADD_KERNEL
 
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    const auto all_true_pg           = wrapper::svptrue<ScalarType>();
-    const auto window_start_x        = static_cast<int>(window.x().start());
-    const auto window_end_x          = static_cast<int>(window.x().end());
-    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-    const bool is_sat                = (policy == ConvertPolicy::SATURATE);
-
-    // Clear X Dimension on execution window as we handle manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Create input windows
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
-    Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
-    Iterator output(dst, window);
-
-    if(is_broadcast_across_x)
-    {
-        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
-        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
-        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
-        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
-        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
-        // Clear X Dimension on execution window as we handle manually
-        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator broadcast_input(broadcast_tensor, broadcast_win);
-        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-        Iterator output(dst, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
-
-            const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const auto       broadcast_value_vec = wrapper::svdup_n(broadcast_value);
-
-            int      x  = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
-            {
-                const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
-                auto       res             = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
-                svst1(pg, output_ptr + x, res);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
-    }
-    else
-    {
-        // Clear X Dimension on execution window as we handle manually
-        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator input1(src0, input1_win);
-        Iterator input2(src1, input2_win);
-        Iterator output(dst, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-            int      x  = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
-            {
-                const auto val1 = svld1(pg, input1_ptr + x);
-                const auto val2 = svld1(pg, input2_ptr + x);
-                const auto res  = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
-                svst1(pg, output_ptr + x, res);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
-    }
-}
 } // namespace cpu
 } // namespace arm_compute
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
 #endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
new file mode 100644
index 0000000..2c3bb0f
--- /dev/null
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::wrapper;
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+struct LoopArguments
+{
+    OperatorType           op;
+    const InputScalarType *input1_ptr;
+    const InputScalarType *input2_ptr;
+    OutputScalarType      *output_ptr;
+};
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+struct BroadcastLoopArguments
+{
+    OperatorType           op;
+    const InputScalarType *input1_ptr;
+    InputScalarType        broadcast_value;
+    OutputScalarType      *output_ptr;
+    bool                   reorder;
+};
+
+template <typename InputScalarType, typename OutputScalarType>
+void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
+{
+    const auto in1 = svld1(pg, args.input1_ptr);
+    const auto in2 = svld1(pg, args.input2_ptr);
+    const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
+    svst1(pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
+{
+    const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
+    const auto broadcast_vector     = svdup_n(args.broadcast_value);
+    const auto in1                  = args.reorder ? broadcast_vector : non_broadcast_vector;
+    const auto in2                  = args.reorder ? non_broadcast_vector : broadcast_vector;
+    const auto res                  = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
+    svst1(pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
+{
+    const auto     in1       = svld1(pg, args.input1_ptr);
+    const auto     in2       = svld1(pg, args.input2_ptr);
+    const auto     res       = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
+    const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+    svst1(output_pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType>
+void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
+{
+    const auto     non_broadcast_vector = svld1(pg, args.input1_ptr);
+    const auto     broadcast_vector     = svdup_n(args.broadcast_value);
+    const auto     in1                  = args.reorder ? broadcast_vector : non_broadcast_vector;
+    const auto     in2                  = args.reorder ? non_broadcast_vector : broadcast_vector;
+    const auto     res                  = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
+    const svbool_t output_pg            = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+    svst1(output_pg, args.output_ptr, res);
+}
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
+
+template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
+using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
+
+template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
+          typename InputScalarType  = typename sve_scalar<InputVectorType>::type,
+          typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
+void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+                    OperatorType op,
+                    LoopFuncType<InputScalarType, OutputScalarType, OperatorType>          func,
+                    BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
+{
+    const auto all_true_pg = svptrue<InputScalarType>();
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+
+    if(is_broadcast_across_x)
+    {
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+            int x = window_start_x;
+
+            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+            do
+            {
+                broadcast_func(pg,
+                {
+                    op,
+                    non_broadcast_input_ptr + x,
+                    broadcast_value,
+                    output_ptr + x,
+                    !is_broadcast_input_2
+                });
+                x += svcnt<InputScalarType>();
+                pg = svwhilelt<InputScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+            int x = window_start_x;
+
+            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+            do
+            {
+                func(pg,
+                {
+                    op,
+                    input1_ptr + x,
+                    input2_ptr + x,
+                    output_ptr + x
+                });
+                x += svcnt<InputScalarType>();
+                pg = svwhilelt<InputScalarType>(x, window_end_x);
+            }
+            while(svptest_any(all_true_pg, pg));
+        },
+        input1, input2, output);
+    }
+}
+
+template <ArithmeticOperation op, typename ScalarType>
+void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    using VectorType = typename sve_vector<ScalarType>::type;
+
+    elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
+                                                                &arithmetic_op_loop<ScalarType, ScalarType>,
+                                                                &arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
+}
+
+template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
+void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+    using InputVectorType  = typename sve_vector<InputScalarType>::type;
+    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+
+    elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
+                                                                           &comparison_op_loop<InputScalarType, OutputScalarType>,
+                                                                           &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
+}
+
+template <>
+svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+    return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+{
+    return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+}
+
+template <>
+svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b)
+{
+    ARM_COMPUTE_UNUSED(pg, a, b);
+    ARM_COMPUTE_ERROR("Not supported");
+}
+
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
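Pulling the template bodies out of the header and into this translation unit is the load-bearing change: only this file is compiled with the SVE march flags, while every includer of elementwise_list.h sees declarations only and links against the explicit instantiations listed above. A reduced sketch of the split, with hypothetical file and function names:

    // kernel.h -- included everywhere; no SVE code is emitted from here.
    template <typename T>
    void add_kernel(const T *a, const T *b, T *dst, int n);

    // kernel_sve.cpp -- the only file compiled with an SVE -march; holds
    // the definition plus one explicit instantiation per supported type.
    template <typename T>
    void add_kernel(const T *a, const T *b, T *dst, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            dst[i] = a[i] + b[i]; // stand-in body; the real one uses SVE intrinsics
        }
    }

    template void add_kernel<float>(const float *, const float *, float *, int);
    template void add_kernel<int>(const int *, const int *, int *, int);

A call for a type that was not explicitly instantiated fails at link time rather than silently instantiating in a non-SVE translation unit, which is why the instantiation lists above enumerate every type the dispatch tables reference.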
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
index 83c3355..a92a864 100644
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
@@ -23,50 +23,61 @@
  */
 #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
 #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/core/NEON/wrapper/svtraits.h"
+#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
 #include <arm_sve.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-namespace sve
-{
 using namespace arm_compute::wrapper;
 
 template <typename VectorType>
-inline VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b)
+VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b)
 {
     return svpow_z(pg, a, b);
 }
 
-template <>
-inline svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
-{
-    return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
-}
-
 template <typename VectorType>
-inline VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b)
+VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b)
 {
     return svdiv_z(pg, a, b);
 }
 
-template <>
-inline svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
+template <uint32_t bytewidth>
+svbool_t narrow_to_byte_predicate(svbool_t pg)
 {
-    return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
+    const auto all_false = svpfalse();
+
+    switch(bytewidth)
+    {
+        case 8:
+            pg = svuzp1_b32(pg, all_false);
+        /* fall through */
+        case 4:
+            pg = svuzp1_b16(pg, all_false);
+        /* fall through */
+        case 2:
+            pg = svuzp1_b8(pg, all_false);
+        /* fall through */
+        default:
+            break;
+    }
+    return pg;
 }
 
 template <typename VectorType>
-inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
+VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
 {
-    using ScalarType = typename sve_scalar<VectorType>::type;
+    using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
     VectorType res{};
 
     switch(op)
@@ -108,30 +120,8 @@
     return res;
 }
 
-template <uint32_t bytewidth>
-inline svbool_t narrow_to_byte_predicate(svbool_t pg)
-{
-    const auto all_false = svpfalse();
-
-    switch(bytewidth)
-    {
-        case 8:
-            pg = svuzp1_b32(pg, all_false);
-        /* fall through */
-        case 4:
-            pg = svuzp1_b16(pg, all_false);
-        /* fall through */
-        case 2:
-            pg = svuzp1_b8(pg, all_false);
-        /* fall through */
-        default:
-            break;
-    }
-    return pg;
-}
-
 template <typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
+OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
 {
     svbool_t selection_vector{};
 
@@ -159,10 +149,10 @@
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
 
-    using InputScalarType = typename sve_scalar<InputVectorType>::type;
+    using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type;
     selection_vector      = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector);
 
-    using OutputScalarType  = typename sve_scalar<OutputVectorType>::type;
+    using OutputScalarType  = typename wrapper::sve_scalar<OutputVectorType>::type;
     const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0));
     const auto true_vector  = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0));
     auto       ret          = svsel(selection_vector, true_vector, false_vector);
@@ -170,197 +160,12 @@
     return ret;
 }
 
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct LoopArguments
-{
-    OperatorType           op;
-    const InputScalarType *input1_ptr;
-    const InputScalarType *input2_ptr;
-    OutputScalarType      *output_ptr;
-};
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct BroadcastLoopArguments
-{
-    OperatorType           op;
-    const InputScalarType *input1_ptr;
-    InputScalarType        broadcast_value;
-    OutputScalarType      *output_ptr;
-    bool                   reorder;
-};
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
-    const auto in1 = svld1(pg, args.input1_ptr);
-    const auto in2 = svld1(pg, args.input2_ptr);
-    const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
-    svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
-    const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
-    const auto broadcast_vector     = svdup_n(args.broadcast_value);
-    const auto in1                  = args.reorder ? broadcast_vector : non_broadcast_vector;
-    const auto in2                  = args.reorder ? non_broadcast_vector : broadcast_vector;
-    const auto res                  = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
-    svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
-    const auto     in1       = svld1(pg, args.input1_ptr);
-    const auto     in2       = svld1(pg, args.input2_ptr);
-    const auto     res       = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
-    const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
-    svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
-    const auto     non_broadcast_vector = svld1(pg, args.input1_ptr);
-    const auto     broadcast_vector     = svdup_n(args.broadcast_value);
-    const auto     in1                  = args.reorder ? broadcast_vector : non_broadcast_vector;
-    const auto     in2                  = args.reorder ? non_broadcast_vector : broadcast_vector;
-    const auto     res                  = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
-    const svbool_t output_pg            = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
-    svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
-          typename InputScalarType  = typename sve_scalar<InputVectorType>::type,
-          typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    OperatorType op,
-                    LoopFuncType<InputScalarType, OutputScalarType, OperatorType>          func,
-                    BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
-{
-    const auto all_true_pg = svptrue<InputScalarType>();
-
-    // Create input windows
-    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const auto window_start_x        = static_cast<int>(window.x().start());
-    const auto window_end_x          = static_cast<int>(window.x().end());
-    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
-    if(is_broadcast_across_x)
-    {
-        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
-        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
-        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
-        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
-        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
-        // Clear X Dimension on execution window as we handle manually
-        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator broadcast_input(broadcast_tensor, broadcast_win);
-        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-        Iterator output(out, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
-            do
-            {
-                broadcast_func(pg,
-                {
-                    op,
-                    non_broadcast_input_ptr + x,
-                    broadcast_value,
-                    output_ptr + x,
-                    !is_broadcast_input_2
-                });
-                x += svcnt<InputScalarType>();
-                pg = svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
-    }
-    else
-    {
-        // Clear X Dimension on execution window as we handle manually
-        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator input1(in1, input1_win);
-        Iterator input2(in2, input2_win);
-        Iterator output(out, win);
-
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
-            do
-            {
-                func(pg,
-                {
-                    op,
-                    input1_ptr + x,
-                    input2_ptr + x,
-                    output_ptr + x
-                });
-                x += svcnt<InputScalarType>();
-                pg = svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
-    }
-}
-
 template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    using VectorType = typename sve_vector<ScalarType>::type;
+void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
 
-    elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
-                                                                &arithmetic_op_loop<ScalarType, ScalarType>,
-                                                                &arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
-    using InputVectorType  = typename sve_vector<InputScalarType>::type;
-    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
-
-    elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
-                                                                           &comparison_op_loop<InputScalarType, OutputScalarType>,
-                                                                           &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
-}
-
-} // namespace sve
+template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
+void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
 #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
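The relocated narrow_to_byte_predicate depends on deliberate switch fall-through: each svuzp1 step halves the lane width of the predicate, taking a bit-per-word mask down to the bit-per-byte mask that a uint8_t store needs. A sketch of how a float comparison lands in a byte output (assumptions: an SVE-enabled compiler, and the library's svsel over 0/~0 vectors simplified here to a zeroing dup):

    #include <arm_sve.h>
    #include <cstdint>

    void compare_ge_f32_to_u8(const float *a, const float *b, uint8_t *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            const svbool_t cmp = svcmpge(pg, svld1(pg, a + x), svld1(pg, b + x));
            // narrow_to_byte_predicate<4>: b32 -> b16 -> b8. On a 128-bit
            // vector an all-true word predicate becomes 4 active lanes out
            // of 16 byte lanes, lining up with 4 uint8_t outputs.
            const svbool_t cmp_b  = svuzp1_b8(svuzp1_b16(cmp, svpfalse()), svpfalse());
            const svbool_t out_pg = svuzp1_b8(svuzp1_b16(pg, svpfalse()), svpfalse());
            // 0xff where the comparison held, 0x00 elsewhere in the window.
            svst1(out_pg, dst + x, svdup_n_u8_z(cmp_b, 0xff));

            x += static_cast<int>(svcntw());
            pg = svwhilelt_b32(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
    }

This is why comparison_op_loop narrows the governing predicate before its svst1: the result vector holds byte lanes, while the loop predicate still describes word (or halfword) lanes.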
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
index b6342c7..6c5524e 100644
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
@@ -26,14 +26,13 @@
 
 #if defined(__ARM_FEATURE_SVE2)
 
+#include "src/core/NEON/wrapper/svtraits.h"
 #include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-namespace sve
-{
 using namespace arm_compute::wrapper;
 
 template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
@@ -176,7 +175,7 @@
     const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
     const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
 
-    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+    using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
 
     const auto result = svcreate4(
                             elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
@@ -200,7 +199,7 @@
     const auto &af = args.reorder ? in2 : in1;
     const auto &bf = args.reorder ? in1 : in2;
 
-    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+    using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
 
     const auto result = svcreate4(
                             elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op),
@@ -221,8 +220,8 @@
 using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
 
 template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
-          typename InputScalarType  = typename sve_scalar<InputVectorType>::type,
-          typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
+          typename InputScalarType  = typename wrapper::sve_scalar<InputVectorType>::type,
+          typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type>
 void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
                               OperatorType op,
                               LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType>          func,
@@ -344,7 +343,7 @@
 template <ArithmeticOperation op, typename ScalarType>
 void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    using VectorType = typename sve_vector<ScalarType>::type;
+    using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type;
     elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
                                                                           &arithmetic_op_quantized_loop<ScalarType, ScalarType>,
                                                                           &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>);
@@ -354,14 +353,12 @@
 void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
-    using InputVectorType  = typename sve_vector<InputScalarType>::type;
-    using OutputVectorType = typename sve_vector<OutputScalarType>::type;
+    using InputVectorType  = typename wrapper::traits::sve_vector<InputScalarType>::type;
+    using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
     elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
                                                                                      &comparison_op_quantized_loop<InputScalarType, OutputScalarType>,
                                                                                      &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>);
 }
-
-} // namespace sve
 } // namespace cpu
 } // namespace arm_compute
 
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
new file mode 100644
index 0000000..cb58548
--- /dev/null
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
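+// Apply the requested unary operation to a single predicated SVE vector.
+// This floating-point overload supports the full operation set; the
+// transcendental cases (EXP, LOG, SIN) go through the SVEMath wrappers.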
+template <typename ScalarType, typename VectorType>
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+{
+    switch(op)
+    {
+        case ElementWiseUnary::RSQRT:
+            return svinvsqrt(pg, a);
+        case ElementWiseUnary::EXP:
+            return wrapper::svexp_z(pg, a);
+        case ElementWiseUnary::NEG:
+            return svneg_z(pg, a);
+        case ElementWiseUnary::LOG:
+            return wrapper::svlog_z(pg, a);
+        case ElementWiseUnary::ABS:
+            return svabs_z(pg, a);
+        case ElementWiseUnary::ROUND:
+            return svrintn_z(pg, a);
+        case ElementWiseUnary::SIN:
+            return wrapper::svsin_z(pg, a);
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED");
+    }
+}
+
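+// Integral overload: only NEG and ABS are defined for integer element types.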
+template <typename ScalarType, typename VectorType>
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+{
+    switch(op)
+    {
+        case ElementWiseUnary::NEG:
+            return svneg_z(pg, a);
+        case ElementWiseUnary::ABS:
+            return svabs_z(pg, a);
+        default:
+            ARM_COMPUTE_ERROR("NOT_SUPPORTED");
+    }
+}
+
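+// Row-wise loop: svwhilelt builds a predicate covering the elements that
+// remain, so the vector loop handles the tail without a scalar epilogue.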
+template <typename ScalarType>
+void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+    const auto all_true_pg    = wrapper::svptrue<ScalarType>();
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(in, win);
+    Iterator output(out, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+        int        x          = window_start_x;
+
+        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+        do
+        {
+            const auto vin = svld1(pg, input_ptr + x);
+            svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+            x += wrapper::svcnt<ScalarType>();
+            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+        }
+        while(svptest_any(all_true_pg, pg));
+    },
+    input, output);
+}
+
+template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
index 23502c7..6349042 100644
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
+++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
@@ -25,87 +25,15 @@
 #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#if defined(__ARM_FEATURE_SVE)
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
+#if defined(ENABLE_SVE)
 
 namespace arm_compute
 {
 namespace cpu
 {
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
-    switch(op)
-    {
-        case ElementWiseUnary::RSQRT:
-            return svinvsqrt(pg, a);
-        case ElementWiseUnary::EXP:
-            return wrapper::svexp_z(pg, a);
-        case ElementWiseUnary::NEG:
-            return svneg_z(pg, a);
-        case ElementWiseUnary::LOG:
-            return wrapper::svlog_z(pg, a);
-        case ElementWiseUnary::ABS:
-            return svabs_z(pg, a);
-        case ElementWiseUnary::ROUND:
-            return svrintn_z(pg, a);
-        case ElementWiseUnary::SIN:
-            return wrapper::svsin_z(pg, a);
-        default:
-            ARM_COMPUTE_ERROR("NOT_SUPPORTED");
-    }
-}
-
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
-    switch(op)
-    {
-        case ElementWiseUnary::NEG:
-            return svneg_z(pg, a);
-        case ElementWiseUnary::ABS:
-            return svabs_z(pg, a);
-        default:
-            ARM_COMPUTE_ERROR("NOT_SUPPORTED");
-    }
-}
-
 template <typename ScalarType>
-void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
-{
-    const auto all_true_pg    = wrapper::svptrue<ScalarType>();
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(in, win);
-    Iterator output(out, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
-        int        x          = window_start_x;
-
-        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        do
-        {
-            const auto vin = svld1(pg, input_ptr + x);
-            svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
-            x += wrapper::svcnt<ScalarType>();
-            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        }
-        while(svptest_any(all_true_pg, pg));
-    },
-    input, output);
-}
-
+void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
 } // namespace cpu
 } // namespace arm_compute
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
 #endif // SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/floor/NEON/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp
similarity index 100%
rename from src/core/cpu/kernels/floor/NEON/fp16.cpp
rename to src/core/cpu/kernels/floor/neon/fp16.cpp
diff --git a/src/core/cpu/kernels/floor/NEON/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp
similarity index 100%
rename from src/core/cpu/kernels/floor/NEON/fp32.cpp
rename to src/core/cpu/kernels/floor/neon/fp32.cpp
diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp
index 99f08db..5b9377c 100644
--- a/src/core/cpu/kernels/scale/sve/fp16.cpp
+++ b/src/core/cpu/kernels/scale/sve/fp16.cpp
@@ -21,6 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
@@ -30,12 +32,10 @@
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace
@@ -173,4 +173,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+#endif // ENABLE_SVE
\ No newline at end of file
diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp
index 94055ae..05fbedf 100644
--- a/src/core/cpu/kernels/scale/sve/fp32.cpp
+++ b/src/core/cpu/kernels/scale/sve/fp32.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
@@ -33,7 +34,6 @@
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -171,4 +171,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+#endif // ENABLE_SVE
\ No newline at end of file
diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp
index 2a724ec..d7e270c 100644
--- a/src/core/cpu/kernels/scale/sve/integer.cpp
+++ b/src/core/cpu/kernels/scale/sve/integer.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
@@ -30,12 +31,10 @@
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace
@@ -298,4 +297,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+#endif // ENABLE_SVE
\ No newline at end of file
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
index c041f14..f747037 100644
--- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp
+++ b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
@@ -31,12 +32,10 @@
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace
@@ -90,8 +89,8 @@
                                 bool align_corners, const Window &window)
 {
     // Data layout is NHWC
-    const int        idx_width   = 1;
-    const int        idx_height  = 2;
+    const int idx_width  = 1;
+    const int idx_height = 2;
 
     // Compute the ratio between source height and destination height
     const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
@@ -205,4 +204,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+#endif // ENABLE_SVE
\ No newline at end of file
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
index 9df4301..584ec7a 100644
--- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
@@ -31,12 +32,10 @@
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace
@@ -90,8 +89,8 @@
                                        bool align_corners, const Window &window)
 {
     // Data layout is NHWC
-    const int        idx_width   = 1;
-    const int        idx_height  = 2;
+    const int idx_width  = 1;
+    const int idx_height = 2;
 
     // Compute the ratio between source height and destination height
     const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
@@ -205,4 +204,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // __ARM_FEATURE_SVE
\ No newline at end of file
+#endif // ENABLE_SVE
\ No newline at end of file
diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h
similarity index 100%
rename from src/core/cpu/kernels/softmax/impl/NEON/list.h
rename to src/core/cpu/kernels/softmax/impl/neon/list.h
diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
new file mode 100644
index 0000000..4ed5a4f
--- /dev/null
+++ b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ENABLE_SVE)
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include <arm_sve.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
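+// Compute the maximum of each 1D row: accumulate with a predicated svmax_m,
+// then reduce horizontally with svmaxv; softmax uses this as its stability shift.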
+template <typename ScalarType>
+void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
+{
+    const auto all_true_pg    = wrapper::svptrue<ScalarType>();
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win{ window };
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator input(in, win);
+    Iterator output(out, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        // Get pointers
+        const auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+        const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+
+        // Init max value
+        auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
+
+        int      x  = window_start_x;
+        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+        do
+        {
+            const auto current_value = svld1(pg, in_ptr + x);
+            vec_max                  = svmax_m(pg, vec_max, current_value);
+
+            x += wrapper::svcnt<ScalarType>();
+            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+        }
+        while(svptest_any(all_true_pg, pg));
+
+        auto max_val = svmaxv(all_true_pg, vec_max);
+
+        *out_ptr = max_val;
+    },
+    input, output);
+}
+
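+// Two-pass softmax over each row: the first pass stores exp(beta * (x - max))
+// in the temporary buffer while accumulating the sum; the second pass
+// multiplies by 1/sum (or subtracts log(sum) in the log-softmax case).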
+template <typename ScalarType>
+void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
+                                 ITensor *out, const float beta, bool is_log, const Window &window)
+{
+    const int start_x     = in->info()->valid_region().anchor.x();
+    const int input_width = in->info()->valid_region().shape.x();
+
+    Iterator in_it(in, window);
+    Iterator max_it(max, window);
+    Iterator out_it(out, window);
+
+    const auto all_true_pg = wrapper::svptrue<ScalarType>();
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        /* Get pointers */
+        const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+        const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+        const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
+
+        ScalarType sum{ 0 };
+
+        /* Compute exponentials and sum */
+        {
+            /* Get max value */
+            const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+            const auto vec_max = wrapper::svdup_n(max_val);
+
+            /* Init sum to zero */
+            auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+            /* Loop over row and compute exponentials and sum */
+            int      x  = 0;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+            do
+            {
+                auto vec_elements = svld1(pg, in_ptr + x);
+                vec_elements      = svsub_z(pg, vec_elements, vec_max);
+                if(is_log)
+                {
+                    vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)));
+                    vec_sum      = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+                }
+                else
+                {
+                    vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))));
+                    vec_sum      = svadd_m(pg, vec_sum, vec_elements);
+                }
+                svst1(pg, tmp_ptr + x, vec_elements);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+            }
+            while(svptest_any(all_true_pg, pg));
+
+            /* Reduce sum */
+            sum = svaddv(all_true_pg, vec_sum);
+
+            if(is_log)
+            {
+                sum = static_cast<ScalarType>(std::log(sum));
+            }
+            else
+            {
+                sum = ScalarType(1) / sum;
+            }
+        }
+
+        /* Normalize exponentials */
+        {
+            /* Loop over row and compute softmax */
+            int      x  = 0;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+            do
+            {
+                auto vec_in           = svld1(pg, tmp_ptr + x);
+                auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+                if(is_log)
+                {
+                    normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                }
+                else
+                {
+                    normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                }
+                svst1(pg, out_ptr + x, normalized_value);
+
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+            }
+            while(svptest_any(all_true_pg, pg));
+        }
+    },
+    in_it, max_it, out_it);
+}
+
+template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
+template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
+template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
+template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
+
+template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
+                                                 ITensor *out, const float beta, bool is_log, const Window &window);
+template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
+                                                     ITensor *out, const float beta, bool is_log, const Window &window);
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(ENABLE_SVE) */
diff --git a/src/core/cpu/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h
similarity index 68%
rename from src/core/cpu/kernels/softmax/impl/SVE/list.h
rename to src/core/cpu/kernels/softmax/impl/sve/list.h
index d558d7d..7ddb358 100644
--- a/src/core/cpu/kernels/softmax/impl/SVE/list.h
+++ b/src/core/cpu/kernels/softmax/impl/sve/list.h
@@ -24,7 +24,7 @@
 #ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
 #define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/NEON/SVEMath.h"
@@ -36,44 +36,11 @@
 namespace cpu
 {
 template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
-    const auto all_true_pg    = wrapper::svptrue<ScalarType>();
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
+void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
 
-    Window win{ window };
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    Iterator input(in, win);
-    Iterator output(out, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        // Get pointers
-        const auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
-        const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-        // Init max value
-        auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
-
-        int      x  = window_start_x;
-        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        do
-        {
-            const auto current_value = svld1(pg, in_ptr + x);
-            vec_max                  = svmax_m(pg, vec_max, current_value);
-
-            x += wrapper::svcnt<ScalarType>();
-            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        }
-        while(svptest_any(all_true_pg, pg));
-
-        auto max_val = svmaxv(all_true_pg, vec_max);
-
-        *out_ptr = max_val;
-    },
-    input, output);
-}
+template <typename ScalarType>
+void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
+                                 ITensor *out, const float beta, bool is_log, const Window &window);
 
 #if defined(__ARM_FEATURE_SVE2)
 template <typename ScalarType>
@@ -249,105 +216,8 @@
     in_it, max_it, out_it);
 }
 #endif /* defined(__ARM_FEATURE_SVE2) */
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
-                                 ITensor *out, const float beta, bool is_log, const Window &window)
-{
-    const int start_x     = in->info()->valid_region().anchor.x();
-    const int input_width = in->info()->valid_region().shape.x();
-
-    Iterator in_it(in, window);
-    Iterator max_it(max, window);
-    Iterator out_it(out, window);
-
-    const auto all_true_pg = wrapper::svptrue<ScalarType>();
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
-        ScalarType sum{ 0 };
-
-        /* Compute exponentials and sum */
-        {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
-            const auto vec_max = wrapper::svdup_n(max_val);
-
-            /* Init sum to zero */
-            auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
-
-            /* Loop over row and compute exponentials and sum */
-            int      x  = 0;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            do
-            {
-                auto vec_elements = svld1(pg, in_ptr + x);
-                vec_elements      = svsub_z(pg, vec_elements, vec_max);
-                if(is_log)
-                {
-                    vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)));
-                    vec_sum      = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
-                }
-                else
-                {
-                    vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))));
-                    vec_sum      = svadd_m(pg, vec_sum, vec_elements);
-                }
-                svst1(pg, tmp_ptr + x, vec_elements);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            }
-            while(svptest_any(all_true_pg, pg));
-
-            /* Reduce sum */
-            sum = svaddv(all_true_pg, vec_sum);
-
-            if(is_log)
-            {
-                sum = static_cast<ScalarType>(std::log(sum));
-            }
-            else
-            {
-                sum = ScalarType(1) / sum;
-            }
-        }
-
-        /* Normalize exponentials */
-        {
-            /* Loop over row and compute softmax */
-            int      x  = 0;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            do
-            {
-                auto vec_in           = svld1(pg, tmp_ptr + x);
-                auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
-                if(is_log)
-                {
-                    normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
-                }
-                else
-                {
-                    normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
-                }
-                svst1(pg, out_ptr + x, normalized_value);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            }
-            while(svptest_any(all_true_pg, pg));
-        }
-    },
-    in_it, max_it, out_it);
-}
-
 } // namespace cpu
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
+#endif /* defined(ENABLE_SVE) */
 
 #endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 577603d..111e969 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -68,11 +68,11 @@
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.25f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.1f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.05f);
             }
@@ -80,11 +80,11 @@
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.9f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return RelativeTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return RelativeTolerance<float>(0.00001f);
             }
@@ -111,11 +111,11 @@
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.25f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
@@ -123,11 +123,11 @@
             switch(data_type)
             {
                 case DataType::F16:
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.9f);
-#else  // !defined(__ARM_FEATURE_SVE)
+#else  // !defined(ENABLE_SVE)
                     return AbsoluteTolerance<float>(0.01f);
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ENABLE_SVE)
                 default:
                     return AbsoluteTolerance<float>(0.00001f);
             }
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index 9834180..ea6656e 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -43,11 +43,11 @@
 {
 namespace
 {
-#if !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
+#if !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
-#else                                                  // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
+#else                                                  // !defined(__aarch64__) || defined(ENABLE_SVE)
 constexpr AbsoluteTolerance<float> tolerance_quant(0);
-#endif                                                 // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE)
+#endif                                                 // !defined(__aarch64__) || defined(ENABLE_SVE)
 
 /** Input data sets **/
 const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",