Per-operator build dependencies

Creates a list of operators and their respective dependencies.
Alters the build system to walk through them, resolve the dependencies,
and build Compute Library.

Removes the following unused kernels/functions:
-[NE|CL]MinMaxLayerKernel
-CLFillBorder

Resolves: COMPMID-4695,COMPMID-4696

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I35ebeef38dac25ec5459cfe9c5f7c9a708621124
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/357914
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6295
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 9b6808e..8b73de5 100644
--- a/Android.bp
+++ b/Android.bp
@@ -226,7 +226,6 @@
         "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp",
         "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
         "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
-        "src/core/CL/kernels/CLMinMaxLayerKernel.cpp",
         "src/core/CL/kernels/CLNormalizationLayerKernel.cpp",
         "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp",
         "src/core/CL/kernels/CLPadLayerKernel.cpp",
@@ -280,7 +279,6 @@
         "src/core/NEON/kernels/NELogicalKernel.cpp",
         "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
-        "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp",
         "src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEPadLayerKernel.cpp",
         "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
@@ -639,7 +637,6 @@
         "src/runtime/CL/functions/CLFFT2D.cpp",
         "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLFill.cpp",
-        "src/runtime/CL/functions/CLFillBorder.cpp",
         "src/runtime/CL/functions/CLFlattenLayer.cpp",
         "src/runtime/CL/functions/CLFloor.cpp",
         "src/runtime/CL/functions/CLFullyConnectedLayer.cpp",
diff --git a/SConscript b/SConscript
index df8f33a..c88a867 100644
--- a/SConscript
+++ b/SConscript
@@ -38,27 +38,27 @@
 Import('install_lib')
 
 def build_bootcode_objs(sources):
-
     arm_compute_env.Append(ASFLAGS = "-I bootcode/")
     obj = arm_compute_env.Object(sources)
     obj = install_lib(obj)
     Default(obj)
     return obj
 
-def build_sve_objs(sources):
 
+def build_sve_objs(sources):
     tmp_env = arm_compute_env.Clone()
     tmp_env.Append(CXXFLAGS = "-march=armv8.2-a+sve+fp16")
     obj = tmp_env.SharedObject(sources)
     Default(obj)
     return obj
 
-def build_objs(sources):
 
+def build_objs(sources):
     obj = arm_compute_env.SharedObject(sources)
     Default(obj)
     return obj
 
+
 def build_library(name, build_env, sources, static=False, libs=[]):
     if static:
         obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs)
@@ -72,6 +72,7 @@
     Default(obj)
     return obj
 
+
 def remove_incode_comments(code):
     def replace_with_empty(match):
         s = match.group(0)
@@ -83,6 +84,7 @@
     comment_regex = re.compile(r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE)
     return re.sub(comment_regex, replace_with_empty, code)
 
+
 def resolve_includes(target, source, env):
     # File collection
     FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
@@ -142,6 +144,7 @@
             file_to_write = "R\"(" + file_to_write + ")\""
             out_file.write(file_to_write)
 
+
 def create_version_file(target, source, env):
 # Generate string with build options library version to embed in the library:
     try:
@@ -153,72 +156,87 @@
     with open(target[0].get_path(), "w") as fd:
         fd.write(build_info)
 
-def get_cpu_runtime_files(operator):
-    file_list = []
-    operators = filelist['cpu']['operators']
 
-    if "operator" in operators[operator]["files"]:
-        file_list += operators[operator]["files"]["operator"]
-    return file_list
+def get_attrs_list(arch, estate, data_types, data_layouts):
+    attrs = []
 
-def get_gpu_runtime_files(operator):
-    file_list = []
-    operators = filelist['gpu']['operators']
+    # Manage data-types
+    if any(i in data_types for i in ['all']):
+        attrs += ['fp16', 'fp32', 'integer', 'qasymm8', 'qasymm8_signed', 'qsymm16']
+    else:
+        if any(i in data_types for i in ['fp16']): attrs += ['fp16']
+        if any(i in data_types for i in ['fp32']): attrs += ['fp32']
+        if any(i in data_types for i in ['integer']): attrs += ['integer']
+        if any(i in data_types for i in ['qasymm8']): attrs += ['qasymm8']
+        if any(i in data_types for i in ['qasymm8_signed']): attrs += ['qasymm8_signed']
+        if any(i in data_types for i in ['qsymm16']): attrs += ['qsymm16']
 
-    if "operator" in operators[operator]["files"]:
-        file_list += operators[operator]["files"]["operator"]
-    return file_list
+    # Manage data-layouts
+    if any(i in data_layouts for i in ['all']):
+        attrs += ['nhwc', 'nchw']
+    else:
+        if any(i in data_layouts for i in ['nhwc']): attrs += ['nhwc']
+        if any(i in data_layouts for i in ['nchw']): attrs += ['nchw']
 
-def get_cpu_kernel_files(operator):
+    # Manage execution state
+    estate_attr = 'estate32' if (estate == 'auto' and 'v7a' in arch) or '32' in estate else 'estate64'
+    attrs += [ estate_attr ]
 
-    file_list = []
-    file_list_sve = []
-    operators = filelist['cpu']['operators']
+    return attrs
 
-    if env['estate'] == '64' and "neon" in operators[operator]['files'] and "estate64" in operators[operator]['files']['neon']:
-        file_list += operators[operator]['files']['neon']['estate64']
-    if env['estate'] == '32' and "neon" in operators[operator]['files'] and "estate32" in operators[operator]['files']['neon']:
-        file_list += operators[operator]['files']['neon']['estate32']
 
-    if "kernel" in operators[operator]["files"]:
-        file_list += operators[operator]["files"]["kernel"]
+def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]):
+    files = { "common" : [] }
 
-    if ("neon" in operators[operator]["files"]):
-        if any(i in env['data_type_support'] for i in ['all', 'qasymm8']) and ("qasymm8" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["qasymm8"]
-        if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']) and ("qasymm8_signed" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["qasymm8_signed"]
-        if any(i in env['data_type_support'] for i in ['all', 'qsymm16']) and ("qsymm16" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["qsymm16"]
-        if any(i in env['data_type_support'] for i in ['all', 'integer']) and ("integer" in operators[operator]["files"]["neon"]):
-                file_list += operators[operator]["files"]["neon"]["integer"]
+    # Early return if the filelist has no entry for the requested backend
+    if backend not in filelist:
+        return files
 
-    if (not "sve" in env['arch'] or env['fat_binary']) and ("neon" in operators[operator]["files"]):
-        if any(i in env['data_type_support'] for i in ['all', 'fp16']) and ("fp16" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["fp16"]
-        if any(i in env['data_type_support'] for i in ['all', 'fp32']) and ("fp32" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["fp32"]
-        if any(i in env['data_layout_support'] for i in ['all', 'nchw']) and ("nchw" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]['files']['neon']['nchw']
-        if ("all" in operators[operator]["files"]["neon"]):
-            file_list += operators[operator]["files"]["neon"]["all"]
-    if ("sve" in env['arch'] or env['fat_binary']) and ("sve" in operators[operator]["files"]):
-        if any(i in env['data_type_support'] for i in ['all', 'fp16']) and ("fp16" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["fp16"]
-        if any(i in env['data_type_support'] for i in ['all', 'fp32'])  and ("fp32" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["fp32"]
-        if any(i in env['data_type_support'] for i in ['all', 'qasymm8']) and ("qasymm8" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["qasymm8"]
-        if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']) and ("qasymm8_signed" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["qasymm8_signed"]
-        if any(i in env['data_type_support'] for i in ['all', 'qsymm16']) and ("qsymm16" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["qsymm16"]
-        if any(i in env['data_type_support'] for i in ['all', 'integer']) and ("integer" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["integer"]
-        if ("all" in operators[operator]["files"]["sve"]):
-            file_list_sve += operators[operator]["files"]["sve"]["all"]
+    # Iterate over operators and create the lists of files to compile
+    for operator in operators:
+        if operator in filelist[backend]['operators']:
+            files['common'] += filelist[backend]['operators'][operator]["files"]["common"]
+            for tech in techs:
+                if tech in filelist[backend]['operators'][operator]["files"]:
+                    # Add tech as a key to dictionary if not there
+                    if tech not in files:
+                        files[tech] = []
 
-    return file_list, file_list_sve
+                    # Add tech files to the tech file list
+                    tech_files = filelist[backend]['operators'][operator]["files"][tech]
+                    files[tech] += tech_files.get('common', [])
+                    for attr in attrs:
+                        files[tech] += tech_files.get(attr, [])
+
+    # Remove duplicates if they exist
+    return {k: list(set(v)) for k,v in files.items()}
+
+def collect_operators(filelist, operators, backend=''):
+    ops = set()
+    for operator in operators:
+        if operator in filelist[backend]['operators']:
+            ops.add(operator)
+            if 'deps' in filelist[backend]['operators'][operator]:
+                ops.update(filelist[backend]['operators'][operator]['deps'])
+        else:
+            print("Operator {0} is unsupported on {1} backend!".format(operator, backend))
+
+    return ops
+
+
+def resolve_operator_dependencies(filelist, operators, backend=''):
+    resolved_operators = collect_operators(filelist, operators, backend)
+
+    are_ops_resolved = False
+    while not are_ops_resolved:
+        resolution_pass = collect_operators(filelist, resolved_operators, backend)
+        if len(resolution_pass) != len(resolved_operators):
+            resolved_operators.update(resolution_pass)
+        else:
+            are_ops_resolved = True
+
+    return resolved_operators
+
 
 arm_compute_env = env.Clone()
 version_file = arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file)
@@ -385,70 +403,61 @@
 with (open(Dir('#').path + '/filelist.json')) as fp:
     filelist = json.load(fp)
 
-core_files = Glob('src/core/*.cpp')
-core_files += Glob('src/core/CPP/*.cpp')
-core_files += Glob('src/core/CPP/kernels/*.cpp')
-core_files += Glob('src/core/helpers/*.cpp')
-core_files += Glob('src/core/utils/*.cpp')
-core_files += Glob('src/core/utils/helpers/*.cpp')
-core_files += Glob('src/core/utils/io/*.cpp')
-core_files += Glob('src/core/utils/quantization/*.cpp')
-core_files += Glob('src/core/utils/misc/*.cpp')
-if env["logging"]:
-    core_files += Glob('src/core/utils/logging/*.cpp')
+# Common backend files
+lib_files = filelist['common']
 
-runtime_files_hp = Glob('src/runtime/*.cpp')
-runtime_files_hp += Glob('src/runtime/CPP/ICPPSimpleFunction.cpp')
-runtime_files = Glob('src/runtime/CPP/functions/*.cpp')
+# Logging files
+if env["logging"]:
+    lib_files += filelist['logging']
 
 # C API files
-runtime_files_hp += filelist['c_api']['common']
-runtime_files_hp += filelist['c_api']['operators']
+lib_files += filelist['c_api']['common']
+lib_files += filelist['c_api']['operators']
 
-if env['opencl']:
-    runtime_files_hp += filelist['c_api']['gpu']
+# Scheduler infrastructure
+lib_files += filelist['scheduler']['single']
+if env['cppthreads']:
+     lib_files += filelist['scheduler']['threads']
+if env['openmp']:
+     lib_files += filelist['scheduler']['omp']
 
-# Common backend files
-core_files += filelist['common']
-
-# Initialize high priority core files
-core_files_hp = core_files
-core_files_sve_hp = []
-core_files = []
-
-runtime_files_hp += Glob('src/runtime/CPP/SingleThreadScheduler.cpp')
-
+# Graph files
 graph_files = Glob('src/graph/*.cpp')
 graph_files += Glob('src/graph/*/*.cpp')
 
-if env['cppthreads']:
-     runtime_files_hp += Glob('src/runtime/CPP/CPPScheduler.cpp')
-
-if env['openmp']:
-     runtime_files_hp += Glob('src/runtime/OMP/OMPScheduler.cpp')
+# Use a reduced operator list for high-priority builds and for user-supplied build configurations
+use_priority_ops = bool(env['high_priority'] or env['build_config'])
+priority_operators = filelist['high_priority']
+if env['build_config'] != "":
+    build_config = env['build_config']
+    build_config_contents = {}
+    if os.path.isfile(build_config):
+        with open(build_config) as f:
+            try:
+                build_config_contents = json.load(f)
+            except:
+                print("Warning: Build configuration file is of invalid JSON format!")
+    else:
+        try:
+            build_config_contents = json.loads(build_config)
+        except:
+            print("Warning: Build configuration string is of invalid JSON format!")
+    if build_config_contents:
+        priority_operators = build_config_contents.get("operators", [])
 
 if env['opencl']:
-    operators = filelist['gpu']['operators']
-    for operator in operators:
-        if operator in filelist['gpu']['high_priority']:
-            runtime_files_hp += get_gpu_runtime_files(operator)
-            if "kernel" in operators[operator]["files"]:
-                core_files_hp += operators[operator]["files"]["kernel"]
-        else:
-            runtime_files += get_gpu_runtime_files(operator)
-            if "kernel" in operators[operator]["files"]:
-                core_files += operators[operator]["files"]["kernel"]
+    lib_files += filelist['c_api']['gpu']
+    lib_files += filelist['gpu']['common']
 
-    runtime_files_hp += filelist['gpu']['common']
-    runtime_files += Glob('src/runtime/CL/functions/*.cpp')
+    cl_operators = priority_operators if use_priority_ops else filelist['gpu']['operators'].keys()
+    cl_ops_to_build = resolve_operator_dependencies(filelist, cl_operators, 'gpu')
+    lib_files += get_operator_backend_files(filelist, cl_ops_to_build, 'gpu')['common']
 
     graph_files += Glob('src/graph/backends/CL/*.cpp')
 
 sve_o = []
-core_files_sve = []
+lib_files_sve = []
 if env['neon']:
-    core_files += Glob('src/core/NEON/*.cpp')
-
     # build winograd/depthwise sources for either v7a / v8a
     arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/convolution/common/",
                                       "src/core/NEON/kernels/convolution/winograd/",
@@ -457,58 +466,55 @@
                                       "arm_compute/core/NEON/kernels/assembly/",
                                       "src/cpu/kernels/assembly/",])
 
-    # Load files based on user's options
-    operators = filelist['cpu']['operators']
-    for operator in operators:
-        if operator in filelist['cpu']['high_priority']:
-            runtime_files_hp += get_cpu_runtime_files(operator)
-            file_list, file_list_sve = get_cpu_kernel_files(operator)
-            core_files_hp += file_list
-            core_files_sve_hp += file_list_sve
-        else:
-            runtime_files += get_cpu_runtime_files(operator)
-            file_list, file_list_sve = get_cpu_kernel_files(operator)
-            core_files += file_list
-            core_files_sve += file_list_sve
+    lib_files += filelist['cpu']['common']
 
-    runtime_files_hp += filelist['cpu']['common']
-    runtime_files_hp += Glob('src/runtime/NEON/*.cpp')
-    runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
+    # Setup SIMD file list to include
+    simd = []
+    if 'sve' in env['arch'] or env['fat_binary']: simd += ['sve']
+    if 'sve' not in env['arch'] or env['fat_binary']: simd += ['neon']
+
+    # Get attributes
+    attrs = get_attrs_list(env['arch'], env['estate'], env['data_type_support'], env['data_layout_support'])
+
+    # Resolve dependencies of the selected CPU operators, then collect their per-SIMD/attribute files
+    cpu_operators = priority_operators if use_priority_ops else filelist['cpu']['operators'].keys()
+    cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu')
+    cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs)
+    lib_files += cpu_files.get('common', [])
+    lib_files += cpu_files.get('neon', [])
+    lib_files_sve += cpu_files.get('sve', [])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
 
+# Restrict from building graph API if a reduced operator list has been provided
+if use_priority_ops:
+    print("Graph library requires all operators to be built")
+    graph_files = []
+
+# Build bootcode in case of bare-metal
 bootcode_o = []
 if env['os'] == 'bare_metal':
     bootcode_files = Glob('bootcode/*.s')
     bootcode_o = build_bootcode_objs(bootcode_files)
 Export('bootcode_o')
 
-high_priority_o = build_objs(core_files_hp + runtime_files_hp)
-high_priority_sve_o = []
+# Build static libraries
 if (env['fat_binary']):
-    sve_o = build_sve_objs(core_files_sve)
-    high_priority_sve_o = build_sve_objs(core_files_sve_hp)
-    arm_compute_a = build_library('arm_compute-static', arm_compute_env, core_files + sve_o + high_priority_o + high_priority_sve_o + runtime_files, static=True)
+    sve_o = build_sve_objs(lib_files_sve)
+    arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + sve_o, static=True)
 else:
-    high_priority_o += build_objs(core_files_sve_hp)
-    arm_compute_a = build_library('arm_compute-static', arm_compute_env, core_files + core_files_sve + high_priority_o + runtime_files, static=True)
+    arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + lib_files_sve, static=True)
 Export('arm_compute_a')
-if env['high_priority']:
-    arm_compute_hp_a = build_library('arm_compute_hp-static', arm_compute_env, high_priority_o + high_priority_sve_o, static=True)
-    Export('arm_compute_hp_a')
 
+# Build shared libraries
 if env['os'] != 'bare_metal' and not env['standalone']:
     if (env['fat_binary']):
-        arm_compute_so = build_library('arm_compute', arm_compute_env, core_files + sve_o + high_priority_sve_o + high_priority_o + runtime_files, static=False)
+        arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + sve_o, static=False)
     else:
-        arm_compute_so = build_library('arm_compute', arm_compute_env, core_files + core_files_sve + high_priority_o + runtime_files , static=False)
+        arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + lib_files_sve, static=False)
 
     Export('arm_compute_so')
 
-    if env['high_priority']:
-        arm_compute_hp_so = build_library('arm_compute_hp', arm_compute_env, high_priority_sve_o + high_priority_o, static=False)
-        Export('arm_compute_hp_so')
-
 # Generate dummy core lib for backwards compatibility
 arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True)
 Export('arm_compute_core_a')
@@ -519,6 +525,7 @@
 
 arm_compute_graph_env = arm_compute_env.Clone()
 
+# Build graph libraries
 arm_compute_graph_env.Append(CXXFLAGS = ['-Wno-redundant-move', '-Wno-pessimizing-move'])
 
 arm_compute_graph_a = build_library('arm_compute_graph-static', arm_compute_graph_env, graph_files, static=True, libs = [ arm_compute_a])
diff --git a/SConstruct b/SConstruct
index ee8108b..7591075 100644
--- a/SConstruct
+++ b/SConstruct
@@ -23,8 +23,10 @@
 # SOFTWARE.
 
 import SCons
+import json
 import os
 import subprocess
+import sys
 
 def version_at_least(version, required):
 
@@ -76,7 +78,8 @@
     ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", ""),
     ("extra_link_flags", "Extra LD flags to be appended to the build command", ""),
     ("compiler_cache", "Command to prefix to the C and C++ compiler (e.g ccache)", ""),
-    ("specs_file", "Specs file to use (e.g. rdimon.specs)", "")
+    ("specs_file", "Specs file to use (e.g. rdimon.specs)", ""),
+    ("build_config", "Operator/Data-type/Data-layout configuration to use for tailored ComputeLibrary builds. Can be a JSON file or a JSON formatted string", "")
 )
 
 env = Environment(platform="posix", variables=vars, ENV = os.environ)
@@ -317,6 +320,13 @@
                            '-DARM_COMPUTE_ENABLE_FP16', '-DARM_COMPUTE_ENABLE_BF16',
                            '-DARM_COMPUTE_ENABLE_I8MM', '-DARM_COMPUTE_ENABLE_SVEF32MM'])
 
+if env['high_priority'] and env['build_config']:
+    print("The high priority library cannot be built in conjuction with a user-specified build configuration")
+    Exit(1)
+
+if not env['high_priority'] and not env['build_config']:
+    env.Append(CPPDEFINES = ['ARM_COMPUTE_GRAPH_ENABLED'])
+
 if env['data_type_support']:
     if any(i in env['data_type_support'] for i in ['all', 'fp16']):
         env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS'])
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 62c9415..442d407 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -57,7 +57,6 @@
 #include "arm_compute/runtime/CL/functions/CLFFT2D.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLFill.h"
-#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
 #include "arm_compute/runtime/CL/functions/CLFloor.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
deleted file mode 100644
index 20f2e15..0000000
--- a/arm_compute/runtime/CL/functions/CLFillBorder.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFILLBORDER_H
-#define ARM_COMPUTE_CLFILLBORDER_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class CLCompileContext;
-class ICLTensor;
-
-/** Basic function to run @ref CLFillBorderKernel */
-class CLFillBorder : public ICLSimpleFunction
-{
-public:
-    /** Initialize the function
-     *
-     * Valid data layouts:
-     * - All
-     *
-     * Valid data type configurations:
-     * |src            |dst            |
-     * |:--------------|:--------------|
-     * |All            |All            |
-     *
-     * @param[in,out] tensor                Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
-     * @param[in]     border_width          The border width
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-    /** Initialize the function
-     *
-     * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] tensor                Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
-     * @param[in]     border_width          The border width
-     * @param[in]     border_mode           Strategy to use for borders.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-};
-}
-#endif /*ARM_COMPUTE_FILLBORDER_H */
diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox
index 6c7b7e9..fc08dbc 100644
--- a/docs/user_guide/library.dox
+++ b/docs/user_guide/library.dox
@@ -561,6 +561,35 @@
 Based on the CPU support, the appropriate kernel will be selected at runtime for execution. Currently this option is
 only supported with armv8.2-a as the base architecture.
 
+@subsection architecture_experimental_per_operator_build Per-operator build
+
+Dependencies for all operators have been explicitly defined; this allows users to generate Compute Library
+binaries that include a user-defined list of operators.
+
+An experimental flag 'build_config' has been introduced where a JSON configuration file can be provided and consumed.
+An example config looks like:
+@code{.py}
+{
+    "operators": [
+        "Activation",
+        "DepthwiseConv2d",
+        "Conv2d",
+        "Permute",
+        "Pool2d",
+        "Reshape"
+    ],
+    "data_types": [
+        "NHWC"
+    ]
+}
+@endcode
+
+Supported data-types options are:
+- "NHWC"
+- "NCHW"
+
+The list of supported operators can be found in filelist.json in the root of Compute Library repo.
+
 @subsection architecture_experimental_build_high_priority_operators Build high priority operators
 
 Selecting high_priority when building Compute Library, one new library will be created: libarm_compute_hp and
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index 92b8f9b..27ba52d 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -1404,9 +1404,9 @@
     <tr><td>All<td>All
     </table>
 <tr>
-  <td rowspan="2">FillBorder
-  <td rowspan="2" style="width:200px;"> Function to fill the borders within the XY-planes.
-  <td rowspan="2">
+  <td rowspan="1">FillBorder
+  <td rowspan="1" style="width:200px;"> Function to fill the borders within the XY-planes.
+  <td rowspan="1">
       <ul>
        <li>n/a
       </ul>
@@ -1421,17 +1421,6 @@
     <tr><td>All<td>All
     </table>
 <tr>
-  <td>CLFillBorder
-  <td>
-      <ul>
-       <li>All
-      </ul>
-  <td>
-    <table>
-    <tr><th>src<th>dst
-    <tr><td>All<td>All
-    </table>
-<tr>
   <td rowspan="2">FlattenLayer
   <td rowspan="2" style="width:200px;"> Reshape a tensor to be 1D
   <td rowspan="2">
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 2eb9aac..583cf4f 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -1315,7 +1315,7 @@
     - NEDequantizationLayerKernel / @ref NEDequantizationLayer
     - NEFloorKernel / @ref NEFloor
     - @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer
-    - NEQuantizationLayerKernel @ref NEMinMaxLayerKernel / @ref NEQuantizationLayer
+    - NEQuantizationLayerKernel NEMinMaxLayerKernel / @ref NEQuantizationLayer
     - @ref NEROIPoolingLayerKernel / @ref NEROIPoolingLayer
     - @ref NEReductionOperationKernel / @ref NEReductionOperation
     - NEReshapeLayerKernel / @ref NEReshapeLayer
@@ -1329,7 +1329,7 @@
     - CLGEMMTranspose1xW
     - CLGEMMMatrixVectorMultiplyKernel
     - @ref CLL2NormalizeLayerKernel / @ref CLL2NormalizeLayer
-    - CLQuantizationLayerKernel @ref CLMinMaxLayerKernel / @ref CLQuantizationLayer
+    - CLQuantizationLayerKernel CLMinMaxLayerKernel / @ref CLQuantizationLayer
     - @ref CLROIPoolingLayerKernel / @ref CLROIPoolingLayer
     - @ref CLReductionOperationKernel / @ref CLReductionOperation
     - CLReshapeLayerKernel / @ref CLReshapeLayer
diff --git a/filelist.json b/filelist.json
index 5171f39..4b85408 100644
--- a/filelist.json
+++ b/filelist.json
@@ -7,8 +7,80 @@
     "src/common/AllocatorWrapper.cpp",
     "src/common/ITensorV2.cpp",
     "src/common/TensorPack.cpp",
-    "src/common/IOperator.cpp"
+    "src/common/IOperator.cpp",
+    "src/core/AccessWindowAutoPadding.cpp",
+    "src/core/AccessWindowStatic.cpp",
+    "src/core/AccessWindowTranspose.cpp",
+    "src/core/Error.cpp",
+    "src/core/GPUTarget.cpp",
+    "src/core/Helpers.cpp",
+    "src/core/IAccessWindow.cpp",
+    "src/core/IKernel.cpp",
+    "src/core/ITensor.cpp",
+    "src/core/ITensorPack.cpp",
+    "src/core/Rounding.cpp",
+    "src/core/Size2D.cpp",
+    "src/core/SubTensorInfo.cpp",
+    "src/core/TensorInfo.cpp",
+    "src/core/Utils.cpp",
+    "src/core/Validate.cpp",
+    "src/core/Version.cpp",
+    "src/core/helpers/SoftmaxHelpers.cpp",
+    "src/core/helpers/WindowHelpers.cpp",
+    "src/core/utils/AssemblyUtils.cpp",
+    "src/core/utils/ScaleUtils.cpp",
+    "src/core/utils/helpers/fft.cpp",
+    "src/core/utils/helpers/tensor_transform.cpp",
+    "src/core/utils/io/FileHandler.cpp",
+    "src/core/utils/misc/MMappedFile.cpp",
+    "src/core/utils/quantization/AsymmHelpers.cpp",
+    "src/core/CPP/CPPTypes.cpp",
+    "src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp",
+    "src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp",
+    "src/core/CPP/kernels/CPPPermuteKernel.cpp",
+    "src/core/CPP/kernels/CPPTopKVKernel.cpp",
+    "src/core/CPP/kernels/CPPUpsampleKernel.cpp",
+    "src/runtime/Allocator.cpp",
+    "src/runtime/BlobLifetimeManager.cpp",
+    "src/runtime/BlobMemoryPool.cpp",
+    "src/runtime/ISimpleLifetimeManager.cpp",
+    "src/runtime/ITensorAllocator.cpp",
+    "src/runtime/IWeightsManager.cpp",
+    "src/runtime/IScheduler.cpp",
+    "src/runtime/Memory.cpp",
+    "src/runtime/MemoryManagerOnDemand.cpp",
+    "src/runtime/OffsetLifetimeManager.cpp",
+    "src/runtime/OffsetMemoryPool.cpp",
+    "src/runtime/OperatorTensor.cpp",
+    "src/runtime/PoolManager.cpp",
+    "src/runtime/RuntimeContext.cpp",
+    "src/runtime/Scheduler.cpp",
+    "src/runtime/SchedulerFactory.cpp",
+    "src/runtime/SchedulerUtils.cpp",
+    "src/runtime/SubTensor.cpp",
+    "src/runtime/Tensor.cpp",
+    "src/runtime/TensorAllocator.cpp",
+    "src/runtime/Utils.cpp",
+    "src/runtime/CPP/ICPPSimpleFunction.cpp",
+    "src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp",
+    "src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp",
+    "src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp",
+    "src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp",
+    "src/runtime/CPP/functions/CPPPermute.cpp",
+    "src/runtime/CPP/functions/CPPTopKV.cpp",
+    "src/runtime/CPP/functions/CPPUpsample.cpp"
   ],
+  "logging": [
+    "src/core/utils/logging/FilePrinter.cpp",
+    "src/core/utils/logging/Helpers.cpp",
+    "src/core/utils/logging/Logger.cpp",
+    "src/core/utils/logging/LoggerRegistry.cpp"
+  ],
+  "scheduler": {
+    "single": [ "src/runtime/CPP/SingleThreadScheduler.cpp" ],
+    "threads": [ "src/runtime/CPP/CPPScheduler.cpp" ],
+    "omp": [ "src/runtime/OMP/OMPScheduler.cpp"]
+  },
   "c_api": {
     "common": [
       "src/c/AclContext.cpp",
@@ -28,6 +100,14 @@
       "src/c/operators/AclActivation.cpp"
     ]
   },
+  "high_priority": [
+    "Activation",
+    "DepthwiseConv2d",
+    "Conv2d",
+    "Permute",
+    "Pool2d",
+    "Reshape"
+  ],
   "gpu": {
     "common": [
       "src/core/CL/CLCompileContext.cpp",
@@ -41,19 +121,11 @@
       "src/core/CL/ICLSimpleKernel.cpp",
       "src/core/CL/ICLTensor.cpp",
       "src/core/CL/OpenCL.cpp",
-      "src/gpu/cl/ClKernelLibrary.cpp",
-      "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp",
-      "src/core/CL/kernels/CLFillBorderKernel.cpp",
       "src/gpu/cl/ClContext.cpp",
+      "src/gpu/cl/ClKernelLibrary.cpp",
       "src/gpu/cl/ClQueue.cpp",
       "src/gpu/cl/ClTensor.cpp",
+      "src/core/CL/kernels/CLFillBorderKernel.cpp",
       "src/runtime/CL/CLBufferAllocator.cpp",
       "src/runtime/CL/CLGEMMHeuristicsHandle.cpp",
       "src/runtime/CL/CLHelpers.cpp",
@@ -68,888 +140,1022 @@
       "src/runtime/CL/CLTuner.cpp",
       "src/runtime/CL/ICLSimpleFunction.cpp",
       "src/runtime/CL/Utils.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp",
-      "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp",
       "src/runtime/CL/mlgo/HeuristicTree.cpp",
       "src/runtime/CL/mlgo/MLGOHeuristics.cpp",
       "src/runtime/CL/mlgo/MLGOParser.cpp",
       "src/runtime/CL/mlgo/Utils.cpp",
       "src/runtime/CL/tuners/CLTuningParametersList.cpp"
     ],
-    "high_priority": [
-      "Activation",
-      "DepthwiseConv2d",
-      "DirectConv2d",
-      "Permute",
-      "Pool2d",
-      "Reshape"
-    ],
     "operators": {
-      "Activation": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClActivation.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClActivationKernel.cpp"
-          ]
-        }
-      },
-      "Add": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClAdd.cpp"
-          ]
-        }
-      },
-      "Cast": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCast.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCastKernel.cpp"
-          ]
-        }
-      },
-      "Concatenate": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConcatenate.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
-            "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
-            "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp"
-          ]
-        }
-      },
-      "DirectConv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClDirectConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp"
-          ]
-        }
-      },
-      "FullyConnected": {
-        "deps": [
-          "ClFlatten",
-          "ClConvertFullyConnectedWeights",
-          "ClGemm",
-          "ClGemmLowpMatrixMultiplyCore",
-          "ClTranspose"
-        ],
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFullyConnected.cpp"
-          ]
-        }
-      },
-      "ConvertFullyConnectedWeights": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp"
-          ]
-        }
-      },
-      "Permute": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPermute.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClPermuteKernel.cpp"
-          ]
-        }
-      },
-      "Pool2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPool2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClPool2dKernel.cpp"
-          ]
-        }
-      },
-      "Conv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConv2d.cpp"
-          ]
-        }
-      },
-      "PRelu": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPRelu.cpp"
-          ]
-        }
-      },
-      "Reshape": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClReshape.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClReshapeKernel.cpp"
-          ]
-        }
-      },
-      "Copy": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCopy.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCopyKernel.cpp"
-          ]
-        }
-      },
-      "Crop": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCrop.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCropKernel.cpp"
-          ]
-        }
-      },
-      "Dequantize": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClDequantize.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClDequantizeKernel.cpp"
-          ]
-        }
-      },
-      "Elementwise": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClElementwiseOperations.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClElementwiseKernel.cpp"
-          ]
-        }
-      },
-      "ElementwiseUnary": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClElementwiseUnary.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp"
-          ]
-        }
-      },
-      "Fill": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFill.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClFillKernel.cpp"
-          ]
-        }
-      },
-      "Flatten": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFlatten.cpp"
-          ]
-        }
-      },
-      "Floor": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFloor.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClFloorKernel.cpp"
-          ]
-        }
-      },
-      "GEMM": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClGemm.cpp",
-            "src/gpu/cl/operators/ClGemmConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowp": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp",
-            "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" 
-          ]
-        }
-      },
-      "Mul": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClMul.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClMulKernel.cpp"
-          ]
-        }
-      },
-      "Quantize": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClQuantize.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClQuantizeKernel.cpp"
-          ]
-        }
-      },
-      "Scale": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClScale.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClScaleKernel.cpp"
-          ]
-        }
-      },
-      "Softmax": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClSoftmax.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClSoftmaxKernel.cpp"
-          ]
-        }
-      },
-      "Sub": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClSub.cpp"
-          ]
-        }
-      },
-      "Transpose": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClTranspose.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClTransposeKernel.cpp"
-          ]
-        }
-      },
-      "GenerateProposals": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp"
-          ]
-        }
-      },
-      "ArgMinMax": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "BatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "BatchToSpace": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
-      "Bitwise": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBitwiseKernel.cpp"
-          ]
-        }
-      },
-      "BoundingBoxTransform": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp"
-          ]
-        }
-      },
-      "ChannelShuffleLayer": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp"
-          ]
-        }
-      },
-      "GEMMConv2d": {
-        "files": {
-          "kernel": [
-            "src/gpu/cl/kernels/ClCol2ImKernel.cpp",
-            "src/gpu/cl/kernels/ClIm2ColKernel.cpp"
-          ]
-        }
-      },
-      "Comparison": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLComparisonKernel.cpp"
-          ]
-        }
-      },
-      "DeconvolutionLayerUpsample": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp"
-          ]
-        }
-      },
-      "DeconvolutionReshapeOutput": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp"
-          ]
-        }
-      },
-      "DepthToSpace": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
-      "DepthwiseConvolutionLayerNative": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp"
-          ]
-        }
-      },
-      "FFTDigitReverse": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp"
-          ]
-        }
-      },
-      "FFTRadixStage": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTRadixStageKernel.cpp"
-          ]
-        }
-      },
-      "FFTScale": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTScaleKernel.cpp"
-          ]
-        }
-      },
-      "FuseBatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "Gather": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGatherKernel.cpp"
-          ]
-        }
-      },
-      "InstanceNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "L2Normalize": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp"
-          ]
-        }
-      },
-      "LogicalNot": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClLogicalNot.cpp"
-          ]
-        }
-      },
-      "MaxUnpooling": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "MeanStdDevNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "MinMax": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "Normalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "NormalizePlanarYUV": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp"
-          ]
-        }
-      },
-      "Pad": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLPadLayerKernel.cpp"
-          ]
-        }
-      },
-      "PriorBox": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp"
-          ]
-        }
-      },
-      "QLSTMLayerNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "Range": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLRangeKernel.cpp"
-          ]
-        }
-      },
-      "ReductionOperation": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReductionOperationKernel.cpp"
-          ]
-        }
-      },
-      "Remap": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLRemapKernel.cpp"
-          ]
-        }
-      },
-      "Reorg": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReorgLayerKernel.cpp"
-          ]
-        }
-      },
-      "Reverse": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReverseKernel.cpp"
-          ]
-        }
-      },
-      "ROIAlign": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLROIAlignLayerKernel.cpp"
-          ]
-        }
-      },
-      "ROIPooling": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "Select": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSelectKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToBatch": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToDepth": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp"
-          ]
-        }
-      },
-      "Stack": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLStackLayerKernel.cpp"
-          ]
-        }
-      },
-      "StridedSlice": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLStridedSliceKernel.cpp"
-          ]
-        }
-      },
-      "Tile": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLTileKernel.cpp"
-          ]
-        }
-      },
-      "WeightsReshape": {
-        "files": {
-          "kernel": [
-            "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp"
-          ]
-        }
-      },
-      "WinogradConv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClWinogradConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp",
-            "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp",
-            "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp"
-          ]
-        }
+    "Activation": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClActivationKernel.cpp",
+          "src/gpu/cl/operators/ClActivation.cpp",
+          "src/runtime/CL/functions/CLActivationLayer.cpp"
+        ]
+      }
+    },
+    "ArgMinMax": {
+      "deps": [ "Reshape" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp",
+          "src/runtime/CL/functions/CLArgMinMaxLayer.cpp"
+        ]
+      }
+    },
+    "Add": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClAdd.cpp"
+        ]
+      }
+    },
+    "BatchNormalization": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp",
+          "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLBatchNormalizationLayer.cpp",
+          "src/runtime/CL/functions/CLFuseBatchNormalization.cpp"
+        ]
+      }
+    },
+    "BatchToSpace": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp",
+          "src/runtime/CL/functions/CLBatchToSpaceLayer.cpp"
+        ]
+      }
+    },
+    "Bitwise": {
+      "files": {
+        "common": [ "src/core/CL/kernels/CLBitwiseKernel.cpp" ]
+      }
+    },
+    "BitwiseAnd": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseAnd.cpp" ]
+      }
+    },
+    "BitwiseNot": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseNot.cpp" ]
+      }
+    },
+    "BitwiseOr": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseOr.cpp" ]
+      }
+    },
+    "BitwiseXor": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseXor.cpp" ]
+      }
+    },
+    "BoundingBoxTransform": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp",
+          "src/runtime/CL/functions/CLBoundingBoxTransform.cpp"
+        ]
+      }
+    },
+    "Cast": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCastKernel.cpp",
+          "src/gpu/cl/operators/ClCast.cpp",
+          "src/runtime/CL/functions/CLCast.cpp"
+        ]
+      }
+    },
+    "ChannelShuffle": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp",
+          "src/runtime/CL/functions/CLChannelShuffleLayer.cpp"
+        ]
+      }
+    },
+    "Comparison": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLComparisonKernel.cpp",
+          "src/runtime/CL/functions/CLComparison.cpp"
+        ]
+      }
+    },
+    "Concatenate": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
+          "src/gpu/cl/operators/ClConcatenate.cpp",
+          "src/runtime/CL/functions/CLConcatenateLayer.cpp"
+        ]
+      }
+    },
+    "Conv2d": {
+      "deps": [
+        "Activation",
+        "ElementwiseBinary",
+        "FFT2D",
+        "Gemm",
+        "Mul",
+        "Pad",
+        "Permute",
+        "Reduction",
+        "Reshape",
+        "Reverse",
+        "Slice"
+      ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClIm2ColKernel.cpp",
+          "src/gpu/cl/kernels/ClCol2ImKernel.cpp",
+          "src/gpu/cl/operators/ClConv2d.cpp",
+          "src/gpu/cl/operators/ClDirectConv2d.cpp",
+          "src/gpu/cl/operators/ClGemmConv2d.cpp",
+          "src/gpu/cl/operators/ClWinogradConv2d.cpp",
+          "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp",
+          "src/runtime/CL/functions/CLConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLDirectConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp"
+        ]
+      }
+    },
+    "Copy": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCopyKernel.cpp",
+          "src/gpu/cl/operators/ClCopy.cpp",
+          "src/runtime/CL/functions/CLCopy.cpp"
+        ]
+      }
+    },
+    "CropResize": {
+      "deps": [ "Copy", "Fill", "Scale" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCropKernel.cpp",
+          "src/gpu/cl/operators/ClCrop.cpp",
+          "src/runtime/CL/functions/CLCrop.cpp",
+          "src/runtime/CL/functions/CLCropResize.cpp"
+        ]
+      }
+    },
+    "Deconv2d": {
+      "deps": [ "Conv2d", "Reverse", "Transpose"],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp",
+          "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp",
+          "src/runtime/CL/functions/CLDeconvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp",
+          "src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp"
+        ]
+      }
+    },
+    "DepthConvert": {
+      "deps": [ "Cast"],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLDepthConvertLayer.cpp" ]
+      }
+    },
+    "DepthToSpace": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp",
+          "src/runtime/CL/functions/CLDepthToSpaceLayer.cpp"
+        ]
+      }
+    },
+    "DepthwiseConv2d": {
+      "deps": [ "Permute" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp",
+          "src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp"
+        ]
+      }
+    },
+    "Dequantize": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClDequantizeKernel.cpp",
+          "src/gpu/cl/operators/ClDequantize.cpp",
+          "src/runtime/CL/functions/CLDequantizationLayer.cpp"
+        ]
+      }
+    },
+    "ElementwiseBinary": {
+      "deps": ["Add", "Sub"],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClElementwiseOperations.cpp",
+          "src/runtime/CL/functions/CLElementwiseOperations.cpp"
+        ]
+      }
+    },
+    "ElementwiseUnary": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
+          "src/gpu/cl/operators/ClElementwiseUnary.cpp",
+          "src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp"
+        ]
+      }
+    },
+    "FFT1D": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp",
+          "src/core/CL/kernels/CLFFTRadixStageKernel.cpp",
+          "src/core/CL/kernels/CLFFTScaleKernel.cpp",
+          "src/runtime/CL/functions/CLFFT1D.cpp"
+        ]
+      }
+    },
+    "FFT2D": {
+      "deps": [ "FFT1D" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLFFT2D.cpp" ]
+      }
+    },
+    "Fill": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClFillKernel.cpp",
+          "src/gpu/cl/operators/ClFill.cpp",
+          "src/runtime/CL/functions/CLFill.cpp"
+        ]
+      }
+    },
+    "Flatten": {
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClFlatten.cpp",
+          "src/runtime/CL/functions/CLFlattenLayer.cpp"
+        ]
+      }
+    },
+    "Floor": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClFloorKernel.cpp",
+          "src/gpu/cl/operators/ClFloor.cpp",
+          "src/runtime/CL/functions/CLFloor.cpp"
+        ]
+      }
+    },
+    "FullyConnected": {
+      "deps": [ "Flatten", "Gemm", "Transpose"],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp",
+          "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp",
+          "src/gpu/cl/operators/ClFullyConnected.cpp",
+          "src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp",
+          "src/runtime/CL/functions/CLFullyConnectedLayer.cpp"
+        ]
+      }
+    },
+    "Gather": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLGatherKernel.cpp",
+          "src/runtime/CL/functions/CLGather.cpp"]
+      }
+    },
+    "Gemm": {
+      "deps": [ "Cast" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp",
+          "src/gpu/cl/operators/ClGemm.cpp",
+          "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp",
+          "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp",
+          "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp",
+          "src/runtime/CL/functions/CLGEMM.cpp",
+          "src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp",
+          "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp"
+        ]
+      }
+    },
+    "GenerateProposals": {
+      "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp",
+          "src/runtime/CL/functions/CLGenerateProposalsLayer.cpp"
+        ]
+      }
+    },
+    "InstanceNormalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp",
+          "src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp"
+        ]
+      }
+    },
+    "L2Normalize": {
+      "deps": [ "Reduction" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp",
+          "src/runtime/CL/functions/CLL2NormalizeLayer.cpp"
+        ]
+      }
+    },
+    "Logical": {
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClLogicalNot.cpp",
+          "src/runtime/CL/functions/CLLogicalAnd.cpp",
+          "src/runtime/CL/functions/CLLogicalNot.cpp",
+          "src/runtime/CL/functions/CLLogicalOr.cpp"
+        ]
+      }
+    },
+    "LSTM": {
+      "deps": [
+        "Activation",
+        "Concatenate",
+        "Copy",
+        "Dequantize",
+        "ElementwiseBinary",
+        "Fill",
+        "FullyConnected",
+        "Gemm",
+        "MeanStdDevNormalize",
+        "Mul",
+        "Quantize",
+        "Slice",
+        "Transpose"
+      ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLQLSTMLayer.cpp",
+          "src/runtime/CL/functions/CLLSTMLayer.cpp",
+          "src/runtime/CL/functions/CLLSTMLayerQuantized.cpp"
+        ]
+      }
+    },
+    "MaxUnpool2d": {
+      "deps": [ "Fill" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
+          "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp"
+        ]
+      }
+    },
+    "MeanStdDevNormalize": {
+      "deps": [ "Reduction" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp",
+          "src/runtime/CL/functions/CLReduceMean.cpp"
+        ]
+      }
+    },
+    "Mul": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClMulKernel.cpp",
+          "src/gpu/cl/operators/ClMul.cpp",
+          "src/runtime/CL/functions/CLPixelWiseMultiplication.cpp"
+        ]
+      }
+    },
+    "Normalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLNormalizationLayerKernel.cpp",
+          "src/runtime/CL/functions/CLNormalizationLayer.cpp"
+        ]
+      }
+    },
+    "Pad": {
+      "deps": [ "Copy" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLPadLayerKernel.cpp",
+          "src/runtime/CL/functions/CLPadLayer.cpp"
+        ]
+      }
+    },
+    "Permute": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClPermuteKernel.cpp",
+          "src/gpu/cl/operators/ClPermute.cpp",
+          "src/runtime/CL/functions/CLPermute.cpp"
+        ]
+      }
+    },
+    "Pool2d": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClPool2dKernel.cpp",
+          "src/gpu/cl/operators/ClPool2d.cpp",
+          "src/runtime/CL/functions/CLPoolingLayer.cpp"
+        ]
+      }
+    },
+    "PRelu": {
+      "deps": [ "ElementwiseBinary" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClPRelu.cpp",
+          "src/runtime/CL/functions/CLPReluLayer.cpp"
+        ]
+      }
+    },
+    "PriorBox": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp",
+          "src/runtime/CL/functions/CLPriorBoxLayer.cpp"
+        ]
+      }
+    },
+    "Quantize": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClQuantizeKernel.cpp",
+          "src/gpu/cl/operators/ClQuantize.cpp",
+          "src/runtime/CL/functions/CLQuantizationLayer.cpp"
+        ]
+      }
+    },
+    "Range": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLRangeKernel.cpp",
+          "src/runtime/CL/functions/CLRange.cpp"
+        ]
+      }
+    },
+    "Reduction": {
+      "deps": [ "Reshape" ],
+      "files": {
+        "common": [ 
+          "src/core/CL/kernels/CLReductionOperationKernel.cpp",
+          "src/runtime/CL/functions/CLReductionOperation.cpp"
+        ]
+      }
+    },
+    "Remap": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLRemapKernel.cpp",
+          "src/runtime/CL/functions/CLRemap.cpp"]
+      }
+    },
+    "Reorg": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLReorgLayerKernel.cpp",
+          "src/runtime/CL/functions/CLReorgLayer.cpp"
+        ]
+      }
+    },
+    "Reshape": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClReshapeKernel.cpp",
+          "src/gpu/cl/operators/ClReshape.cpp",
+          "src/runtime/CL/functions/CLReshapeLayer.cpp"
+        ]
+      }
+    },
+    "Reverse": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLReverseKernel.cpp",
+          "src/runtime/CL/functions/CLReverse.cpp"
+        ]
+      }
+    },
+    "RNN": {
+      "deps": [ "Activation", "Cast", "ElementwiseBinary", "FullyConnected", "Gemm"],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLRNNLayer.cpp" ]
+      }
+    },
+    "ROIAlign": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLROIAlignLayerKernel.cpp",
+          "src/runtime/CL/functions/CLROIAlignLayer.cpp"
+        ]
+      }
+    },
+    "ROIPool2d": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp",
+          "src/runtime/CL/functions/CLROIPoolingLayer.cpp"
+        ]
+      }
+    },
+    "Scale": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClScaleKernel.cpp",
+          "src/gpu/cl/operators/ClScale.cpp",
+          "src/runtime/CL/functions/CLScale.cpp"
+        ]
+      }
+    },
+    "Select": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSelectKernel.cpp",
+          "src/runtime/CL/functions/CLSelect.cpp"
+        ]
+      }
+    },
+    "Slice": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLSlice.cpp" ]
+      }
+    },
+    "Softmax": {
+      "deps": [ "Permute" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClSoftmaxKernel.cpp",
+          "src/gpu/cl/operators/ClSoftmax.cpp",
+          "src/runtime/CL/functions/CLSoftmaxLayer.cpp"
+        ]
+      }
+    },
+    "SpaceToBatch": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp",
+          "src/runtime/CL/functions/CLSpaceToBatchLayer.cpp"
+        ]
+      }
+    },
+    "SpaceToDepth": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp",
+          "src/runtime/CL/functions/CLSpaceToDepthLayer.cpp"
+        ]
+      }
+    },
+    "Split": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLSplit.cpp" ]
+      }
+    },
+    "Stack": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLStackLayerKernel.cpp",
+          "src/runtime/CL/functions/CLStackLayer.cpp"
+        ]
+      }
+    },
+    "StridedSlice": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLStridedSliceKernel.cpp",
+          "src/runtime/CL/functions/CLStridedSlice.cpp"
+        ]
+      }
+    },
+    "Sub": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClSub.cpp"
+        ]
+      }
+    },
+    "Tile": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLTileKernel.cpp",
+          "src/runtime/CL/functions/CLTile.cpp"
+        ]
+      }
+    },
+    "Transpose": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClTransposeKernel.cpp",
+          "src/gpu/cl/operators/ClTranspose.cpp",
+          "src/runtime/CL/functions/CLTranspose.cpp"
+        ]
+      }
+    },
+    "Unstack": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLUnstack.cpp" ]
+      }
+    },
+    "YUVNormalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp",
+          "src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp"
+        ]
       }
     }
-  },
+  }
+},
   "cpu": {
     "common": [
       "src/cpu/CpuContext.cpp",
       "src/cpu/CpuQueue.cpp",
-      "src/cpu/CpuTensor.cpp"
-    ],
-    "high_priority": [
-      "Activation",
-      "DepthwiseConv2d",
-      "DirectConv2d",
-      "Permute",
-      "Pool2d",
-      "Reshape",
-      "FillBorder"
+      "src/cpu/CpuTensor.cpp",
+      "src/core/NEON/kernels/NEFillBorderKernel.cpp",
+      "src/runtime/NEON/INEOperator.cpp",
+      "src/runtime/NEON/INESimpleFunction.cpp",
+      "src/runtime/NEON/INESimpleFunctionNoBorder.cpp"
     ],
     "operators": {
       "Activation": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuActivation.cpp"
+          "common": [
+            "src/cpu/operators/CpuActivation.cpp",
+            "src/cpu/kernels/CpuActivationKernel.cpp",
+            "src/runtime/NEON/functions/NEActivationLayer.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuActivationKernel.cpp"
-          ],
-          "sve": {
-            "fp32": [
-              "src/cpu/kernels/activation/sve/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/activation/sve/fp16.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/activation/sve/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/activation/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/activation/sve/qasymm8_signed.cpp"
-            ]
-          },
           "neon": {
-            "fp32": [
-              "src/cpu/kernels/activation/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/activation/neon/fp16.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/activation/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/activation/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/activation/neon/qasymm8_signed.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp" ]
+          },
+          "sve": {
+            "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp", "src/cpu/kernels/activation/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp", "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp", "src/cpu/kernels/activation/sve/qsymm16.cpp" ]
           }
         }
       },
+      "ArgMinMax": {
+        "deps": [ "Reduction" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEArgMinMaxLayer.cpp" ]
+        }
+      },
       "Add": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuAdd.cpp"
+          "common": [
+            "src/cpu/operators/CpuAdd.cpp",
+            "src/cpu/kernels/CpuAddKernel.cpp",
+            "src/runtime/NEON/functions/NEArithmeticAddition.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuAddKernel.cpp"
-          ],
-          "sve": {
-            "all": [
-              "src/cpu/kernels/add/sve/impl.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/add/sve/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/add/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/add/sve/qasymm8_signed.cpp"
-            ]
-          },
           "neon": {
-            "qsymm16": [
-              "src/cpu/kernels/add/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/add/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/add/neon/qasymm8_signed.cpp"
-            ]
+            "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp" ]
+          },
+          "sve": {
+            "common": [ "src/cpu/kernels/add/sve/impl.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp", "src/cpu/kernels/add/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp", "src/cpu/kernels/add/sve/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp", "src/cpu/kernels/add/sve/qsymm16.cpp" ]
           }
         }
       },
-      "BatchNorm": {
+      "BatchNormalize": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp",
+            "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp",
+            "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp"
           ],
-          "sve": {
-            "fp32": [
-              "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp"
-            ],
-            "fp16": [
-              "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp"
-            ]
-          },
           "neon": {
-            "fp32": [
-              "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp"
-            ],
-            "fp16": [
-              "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp"
-            ]
+            "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" ],
+            "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" ]
+          },
+          "sve": {
+            "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" ],
+            "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" ]
           }
         }
       },
       "BatchToSpace": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp"
           ]
         }
       },
       "BitwiseAnd": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseAndKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseAndKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseAnd.cpp"
           ]
         }
       },
       "BitwiseNot": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseNotKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseNotKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseNot.cpp"
           ]
         }
       },
       "BitwiseOr": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseOrKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseOrKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseOr.cpp"
           ]
         }
       },
       "BitwiseXor": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseXorKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseXorKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseXor.cpp"
           ]
         }
       },
       "BoundingBoxTransform": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp"
-          ]
-        }
-      },
-      "ChannelShuffleLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp"
-          ]
-        }
-      },
-      "Col2Im": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuCol2ImKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp",
+            "src/runtime/NEON/functions/NEBoundingBoxTransform.cpp"
           ]
         }
       },
       "Cast": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuCast.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuCastKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuCast.cpp",
+            "src/cpu/kernels/CpuCastKernel.cpp",
+            "src/runtime/NEON/functions/NECast.cpp"
+          ]
+        }
+      },
+      "ChannelShuffle": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEChannelShuffleLayer.cpp"
           ]
         }
       },
       "Concatenate": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuConcatenate.cpp"
-          ],
-          "kernel": [
+          "common": [
+            "src/cpu/operators/CpuConcatenate.cpp",
             "src/cpu/kernels/CpuConcatenateWidthKernel.cpp",
             "src/cpu/kernels/CpuConcatenateBatchKernel.cpp",
             "src/cpu/kernels/CpuConcatenateDepthKernel.cpp",
-            "src/cpu/kernels/CpuConcatenateHeightKernel.cpp"
+            "src/cpu/kernels/CpuConcatenateHeightKernel.cpp",
+            "src/runtime/NEON/functions/NEConcatenateLayer.cpp"
           ]
         }
       },
-      "ConvertFullyConnectedWeights": {
+      "Conv2d": {
+        "deps": [
+          "Activation",
+          "ElementwiseBinary",
+          "FFT2D",
+          "Gemm",
+          "Mul",
+          "Pad",
+          "Permute",
+          "Reshape",
+          "Reverse",
+          "Slice"
+        ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuConvertFullyConnectedWeights.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp"
-          ]
-        }
-      },
-      "ConvertQuantizedSignedness": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp"
-          ]
-        }
-      },
-      "Convolution": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuConv2d.cpp"
+          "common": [
+            "src/cpu/operators/CpuConv2d.cpp",
+            "src/cpu/operators/CpuDirectConv2d.cpp",
+            "src/cpu/operators/CpuGemmDirectConv2d.cpp",
+            "src/cpu/operators/CpuGemmConv2d.cpp",
+            "src/cpu/operators/CpuWinogradConv2d.cpp",
+            "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
+            "src/cpu/kernels/CpuDirectConv2dKernel.cpp",
+            "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp",
+            "src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
+            "src/cpu/kernels/CpuCol2ImKernel.cpp",
+            "src/cpu/kernels/CpuIm2ColKernel.cpp",
+            "src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
+            "src/core/NEON/kernels/convolution/common/padding.cpp",
+            "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
+            "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
+            "src/core/NEON/kernels/convolution/common/utils.cpp",
+            "src/core/NEON/kernels/convolution/winograd/padding.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
+            "src/runtime/NEON/functions/NEConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEGEMMConv2d.cpp",
+            "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp"
           ]
         }
       },
       "Copy": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuCopy.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuCopyKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuCopy.cpp",
+            "src/cpu/kernels/CpuCopyKernel.cpp",
+            "src/runtime/NEON/functions/NECopy.cpp"
           ]
         }
       },
-      "Crop": {
+      "CropResize": {
+        "deps": [ "Scale" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NECropKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NECropKernel.cpp",
+            "src/runtime/NEON/functions/NECropResize.cpp"
+          ]
+        }
+      },
+      "Deconv2d": {
+        "deps": [ "Conv2d", "Reverse", "Transpose"],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NEDeconvolutionLayer.cpp"
+          ]
+        }
+      },
+      "DepthConvert": {
+        "deps": [ "Cast"],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NEDepthConvertLayer.cpp"
+          ]
+        }
+      },
+      "DepthToSpace": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp"
           ]
         }
       },
       "DepthwiseConv2d": {
-        "deps": [
-          "Activation",
-          "Permute"
-        ],
+        "deps": [ "Activation", "Permute" ],
         "files": {
-          "operator": [
+          "common": [
             "src/cpu/operators/CpuDepthwiseConv2d.cpp",
             "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp",
-            "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp"
-          ],
-          "kernel": [
+            "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
+            "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp",
             "src/core/NEON/kernels/convolution/common/padding.cpp",
             "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
             "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
             "src/core/NEON/kernels/convolution/common/utils.cpp",
-            "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp"
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+            "src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp"
           ],
+          "neon": {
+            "estate64": [
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp"
+            ]
+          },
           "sve": {
-            "all": [
+            "common": [
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp",
@@ -994,17 +1200,7 @@
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp"
-            ]
-          },
-          "neon": {
-            "estate64": [
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
@@ -1059,166 +1255,122 @@
           }
         }
       },
-      "DepthToSpaceLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
       "Dequantize": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuDequantize.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuDequantizeKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuDequantize.cpp",
+            "src/cpu/kernels/CpuDequantizeKernel.cpp",
+            "src/runtime/NEON/functions/NEDequantizationLayer.cpp"
           ]
         }
       },
-      "DirectConv2d": {
-        "deps": [
-          "Activation",
-          "FillBorder"
-        ],
+      "DetectionPostProcess": {
+        "deps": [ "Dequantize" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuDirectConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuDirectConv2dKernel.cpp",
-            "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp"
-          ]
+          "common" : [ "src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp" ]
         }
       },
-      "Elementwise": {
+      "ElementwiseBinary": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuElementwise.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuElementwiseKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuElementwise.cpp",
+            "src/cpu/kernels/CpuElementwiseKernel.cpp",
+            "src/runtime/NEON/functions/NEElementwiseOperations.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/elementwise/sve/elementwise.cpp"
-            ]
+            "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ]
           }
         }
       },
-      "ElementwiseUnary": {
+      "ElementwiseUnary":{
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuElementwiseUnary.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuElementwiseUnary.cpp",
+            "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
+            "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp"
-            ]
+            "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ]
           }
         }
       },
       "FFT1D": {
         "files": {
-          "kernel": [
+          "common": [
             "src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp",
             "src/core/NEON/kernels/NEFFTRadixStageKernel.cpp",
-            "src/core/NEON/kernels/NEFFTScaleKernel.cpp"
+            "src/core/NEON/kernels/NEFFTScaleKernel.cpp",
+            "src/runtime/NEON/functions/NEFFT1D.cpp"
           ]
         }
       },
-      "FillBorder": {
+      "FFT2D": {
+        "deps": [ "FFT1D" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEFillBorderKernel.cpp"
-          ]
-        }
-      },
-      "Flatten": {
-        "deps: ": [
-          "Reshape"
-        ],
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuFlatten.cpp"
+          "common": [
+            "src/runtime/NEON/functions/NEFFT2D.cpp"
           ]
         }
       },
       "Fill": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFill.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuFillKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuFill.cpp",
+            "src/cpu/kernels/CpuFillKernel.cpp",
+            "src/runtime/NEON/functions/NEFill.cpp"
+          ]
+        }
+      },
+      "Flatten": {
+        "deps": [ "Reshape" ],
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuFlatten.cpp",
+            "src/runtime/NEON/functions/NEFlattenLayer.cpp"
           ]
         }
       },
       "Floor": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFloor.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuFloorKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuFloor.cpp",
+            "src/cpu/kernels/CpuFloorKernel.cpp",
+            "src/runtime/NEON/functions/NEFloor.cpp"
           ],
           "neon": {
-            "fp32": [
-              "src/cpu/kernels/floor/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/floor/neon/fp16.cpp"
-            ]
+            "fp32": [ "src/cpu/kernels/floor/neon/fp32.cpp" ],
+            "fp16": [ "src/cpu/kernels/floor/neon/fp16.cpp" ]
           }
         }
       },
       "FullyConnected": {
-        "deps": [
-          "CpuFlatten",
-          "CpuConvertFullyConnectedWeights",
-          "CpuGemm",
-          "CpuGemmLowpMatrixMultiplyCore"
-        ],
+        "deps": [ "Flatten", "Gemm", "Transpose"],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFullyConnected.cpp"
-          ]
-        },
-        "kernel": [
-          "CpuTransposeKernel"
-        ]
-      },
-      "FuseBatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp"
+          "common": [
+            "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
+            "src/cpu/operators/CpuConvertFullyConnectedWeights.cpp",
+            "src/cpu/operators/CpuFullyConnected.cpp",
+            "src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp",
+            "src/runtime/NEON/functions/NEFullyConnectedLayer.cpp"
           ]
         }
       },
-      "GEMM": {
+      "Gather": {
         "files": {
-          "operator" : ["src/cpu/operators/CpuGemm.cpp"],
-          "kernel": [
+          "common": [
+            "src/core/NEON/kernels/NEGatherKernel.cpp",
+            "src/runtime/NEON/functions/NEGather.cpp"
+          ]
+        }
+      },
+      "Gemm": {
+        "files": {
+          "common": [
+            "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp",
             "src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp",
             "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp",
             "src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp",
-            "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowp": {
-        "deps": [
-          "GemmAssemblyDispatch"
-        ],
-        "files": {
-          "operator" : [
-              "src/cpu/operators/CpuGemmLowpOutputStage.cpp",
-              "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp"
-          ],
-          "kernel": [
+            "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
@@ -1226,36 +1378,12 @@
             "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp",
-            "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp"
-          ]
-        }
-      },
-      "GEMMConvolution": {
-        "deps": [
-          "Activation",
-          "Col2Im",
-          "Reshape",
-          "Im2Col",
-          "GEMMLowpOffsetContributionOutputStage",
-          "ConvertQuantizedSignedness"
-        ],
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuGemmConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuWeightsReshapeKernel.cpp"
-          ]
-        }
-      },
-      "GemmAssemblyDispatch": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp"
-          ],
-          "kernel": [
-            "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
+            "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp",
+            "src/cpu/operators/CpuGemm.cpp",
+            "src/cpu/operators/CpuGemmLowpOutputStage.cpp",
+            "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
+            "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_int8.cpp",
@@ -1263,14 +1391,17 @@
             "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+            "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
             "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
             "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
-            "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
             "src/core/NEON/kernels/arm_gemm/misc.cpp",
             "src/core/NEON/kernels/arm_gemm/quantized.cpp",
             "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
             "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
-            "src/core/NEON/kernels/arm_gemm/transform.cpp"
+            "src/core/NEON/kernels/arm_gemm/transform.cpp",
+            "src/runtime/NEON/functions/NEGEMM.cpp",
+            "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
+            "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp"
           ],
           "neon": {
             "estate32": [
@@ -1344,7 +1475,7 @@
             ]
           },
           "sve": {
-            "all": [
+            "common": [
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp",
@@ -1384,152 +1515,196 @@
               "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
-              "src/core/NEON/kernels/arm_gemm/transform-sve.cpp"
+              "src/core/NEON/kernels/arm_gemm/transform-sve.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp"
             ]
           }
         }
       },
-      "GemmDirectConv2d": {
-        "deps": [
-          "Activation",
-          "GemmAssemblyDispatch",
-          "Permute"
-        ],
+      "GenerateProposals": {
+        "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuGemmDirectConv2d.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp"
           ]
         }
       },
-      "Mul": {
+      "InstanceNormalize": {
+        "deps": [ "Permute", "Reduction" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuMul.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuMulKernel.cpp"
-          ]
-        }
-      },
-      "Quantize": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuQuantize.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuQuantizeKernel.cpp"
-          ]
-        }
-      },
-      "Reshape": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuReshape.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuReshapeKernel.cpp"
-          ]
-        }
-      },
-      "Gather": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEGatherKernel.cpp"
-          ]
-        }
-      },
-      "GenerateProposalsLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp"
-          ]
-        }
-      },
-      "Im2Col": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuIm2ColKernel.cpp"
-          ]
-        }
-      },
-      "InstanceNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp"
           ]
         }
       },
       "L2Normalize": {
-        "deps": [
-          "Reduction"
-        ],
+        "deps": [ "Reduction" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEL2NormalizeLayer.cpp"
           ]
         }
       },
       "Logical": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NELogicalKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NELogicalKernel.cpp",
+            "src/runtime/NEON/functions/NELogical.cpp"
           ]
         }
       },
-      "MaxUnpooling": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "MeanStdDevNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "MinMax": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "Normalization": {
+      "LSTM": {
         "deps": [
-          "PixelWiseMultiplication"
+          "Activation",
+          "Concatenate",
+          "Copy",
+          "Dequantize",
+          "ElementwiseBinary",
+          "Fill",
+          "FullyConnected",
+          "Gemm",
+          "MeanStdDevNormalize",
+          "Mul",
+          "Quantize",
+          "Slice",
+          "Transpose"
         ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NENormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NELSTMLayer.cpp",
+            "src/runtime/NEON/functions/NELSTMLayerQuantized.cpp",
+            "src/runtime/NEON/functions/NEQLSTMLayer.cpp"
+          ]
+        }
+      },
+      "MaxUnpool2d": {
+        "deps": [ "Fill" ],
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp"
+          ]
+        }
+      },
+      "Mean": {
+        "deps" : [ "Reduction" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEReduceMean.cpp" ]
+        }
+      },
+      "MeanStdDevNormalize": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp"
+          ]
+        }
+      },
+      "Mul": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuMul.cpp",
+            "src/cpu/kernels/CpuMulKernel.cpp",
+            "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp"
+          ]
+        }
+      },
+      "Normalize": {
+        "deps": [ "Mul" ],
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
+            "src/runtime/NEON/functions/NENormalizationLayer.cpp"
           ]
         }
       },
       "Pad": {
+        "deps": [ "Concatenate", "Copy", "StridedSlice" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEPadLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEPadLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEPadLayer.cpp"
           ]
         }
       },
       "Permute": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuPermute.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuPermuteKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuPermute.cpp",
+            "src/cpu/kernels/CpuPermuteKernel.cpp",
+            "src/runtime/NEON/functions/NEPermute.cpp"
           ]
         }
       },
       "Pool2d": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuPool2d.cpp"
-          ],
-          "kernel": [
+          "common": [
+            "src/cpu/operators/CpuPool2d.cpp",
             "src/cpu/kernels/CpuPool2dKernel.cpp",
             "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
@@ -1538,24 +1713,15 @@
             "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
-            "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp"
+            "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
+            "src/runtime/NEON/functions/NEPoolingLayer.cpp"
           ],
           "neon": {
-            "nchw": [
-              "src/cpu/kernels/pool2d/neon/nchw/all.cpp"
-            ],
-            "fp32": [
-              "src/cpu/kernels/pool2d/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/pool2d/neon/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/pool2d/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"
-            ],
+            "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
+            "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ],
             "estate64": [
               "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
@@ -1578,15 +1744,17 @@
             ]
           },
           "sve": {
-            "all": [
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+            "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ],
+            "common": [
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp",
@@ -1596,240 +1764,259 @@
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp"
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
             ]
           }
         }
       },
-      "PriorBox": {
+      "PRelu": {
+        "deps": [ "ElementwiseBinary" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp"
+          "common": [
+            "src/runtime/NEON/functions/NEPReluLayer.cpp"
           ]
         }
       },
-      "QLSTMLayerNormalization": {
+      "PriorBox": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEPriorBoxLayer.cpp"
+          ]
+        }
+      },
+      "Quantize": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuQuantize.cpp",
+            "src/cpu/kernels/CpuQuantizeKernel.cpp",
+            "src/runtime/NEON/functions/NEQuantizationLayer.cpp"
           ]
         }
       },
       "Range": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NERangeKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NERangeKernel.cpp",
+            "src/runtime/NEON/functions/NERange.cpp"
           ]
         }
       },
-      "ReductionOperation": {
+      "Reduction":{
+        "deps": [ "Reshape" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReductionOperationKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReductionOperationKernel.cpp",
+            "src/runtime/NEON/functions/NEReductionOperation.cpp"
           ]
         }
       },
       "Remap": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NERemapKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NERemapKernel.cpp",
+            "src/runtime/NEON/functions/NERemap.cpp"
           ]
         }
       },
       "Reorg": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReorgLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReorgLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEReorgLayer.cpp"
+          ]
+        }
+      },
+      "Reshape": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuReshape.cpp",
+            "src/cpu/kernels/CpuReshapeKernel.cpp",
+            "src/runtime/NEON/functions/NEReshapeLayer.cpp"
           ]
         }
       },
       "Reverse": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReverseKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReverseKernel.cpp",
+            "src/runtime/NEON/functions/NEReverse.cpp"
           ]
         }
       },
+      "RNN": {
+        "deps": [ "Activation", "Add", "FullyConnected", "Gemm"],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NERNNLayer.cpp" ]
+        }
+      },
       "ROIAlign": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEROIAlignLayer.cpp"
           ]
         }
       },
-      "ROIPooling": {
+      "ROIPool2d": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "Select": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESelectKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToBatch": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToDepth": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp"
-          ]
-        }
-      },
-      "Stack": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEStackLayerKernel.cpp"
-          ]
-        }
-      },
-      "StridedSlice": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEStridedSliceKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEROIPoolingLayer.cpp"
           ]
         }
       },
       "Scale": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuScale.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuScaleKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuScale.cpp",
+            "src/cpu/kernels/CpuScaleKernel.cpp",
+            "src/runtime/NEON/functions/NEScale.cpp"
           ],
           "sve": {
-            "fp32": [
-              "src/cpu/kernels/scale/sve/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/scale/sve/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/scale/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/scale/sve/qasymm8_signed.cpp"
-            ],
-            "integer": [
-              "src/cpu/kernels/scale/sve/integer.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/scale/sve/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/scale/sve/fp32.cpp" ],
+            "integer": [ "src/cpu/kernels/scale/sve/integer.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/scale/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/scale/sve/qasymm8_signed.cpp" ]
+
           },
           "neon": {
-            "fp16": [
-              "src/cpu/kernels/scale/neon/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/scale/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/scale/neon/qasymm8_signed.cpp"
-            ],
-            "integer": [
-              "src/cpu/kernels/scale/neon/integer.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/scale/neon/fp16.cpp" ],
+            "integer": [ "src/cpu/kernels/scale/neon/integer.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/scale/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/scale/neon/qasymm8_signed.cpp" ]
           }
         }
       },
+      "Select": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESelectKernel.cpp",
+            "src/runtime/NEON/functions/NESelect.cpp"
+          ]
+        }
+      },
+      "Slice": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NESlice.cpp" ]
+        }
+      },
       "Softmax": {
         "deps": [
           "Permute"
         ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuSoftmax.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuSoftmaxKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuSoftmax.cpp",
+            "src/cpu/kernels/CpuSoftmaxKernel.cpp",
+            "src/runtime/NEON/functions/NESoftmaxLayer.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/softmax/impl/sve/impl.cpp"
-            ]
+            "common": [ "src/cpu/kernels/softmax/impl/sve/impl.cpp" ]
           }
         }
       },
+      "SpaceToBatch": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp",
+            "src/runtime/NEON/functions/NESpaceToBatchLayer.cpp"
+          ]
+        }
+      },
+      "SpaceToDepth": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp",
+            "src/runtime/NEON/functions/NESpaceToDepthLayer.cpp"
+          ]
+        }
+      },
+      "Split": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NESplit.cpp"
+          ]
+        }
+      },
+      "Stack": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEStackLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEStackLayer.cpp"
+          ]
+        }
+      },
+      "StridedSlice": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEStridedSliceKernel.cpp",
+            "src/runtime/NEON/functions/NEStridedSlice.cpp"
+          ]
+        }
+      },
       "Sub": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuSub.cpp"
+          "common": [
+            "src/cpu/operators/CpuSub.cpp",
+            "src/cpu/kernels/CpuSubKernel.cpp",
+            "src/runtime/NEON/functions/NEArithmeticSubtraction.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuSubKernel.cpp"
-          ],
+          "sve": {
+            "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ]
+          },
           "neon": {
-            "qsymm16": [
-              "src/cpu/kernels/sub/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/sub/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/sub/neon/qasymm8_signed.cpp"
-            ]
+            "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ]
           }
         }
       },
-      "Transpose": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuTranspose.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuTransposeKernel.cpp"
-          ]
-        }
-      },
       "Tile": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NETileKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NETileKernel.cpp",
+            "src/runtime/NEON/functions/NETile.cpp"
           ]
         }
       },
-      "WinogradConvolution": {
-        "deps": [
-          "Activation",
-          "Permute"
-        ],
+      "Transpose": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuWinogradConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
-            "src/core/NEON/kernels/convolution/winograd/padding.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp"
+          "common": [
+            "src/cpu/kernels/CpuTransposeKernel.cpp",
+            "src/cpu/operators/CpuTranspose.cpp",
+            "src/runtime/NEON/functions/NETranspose.cpp"
           ]
         }
+      },
+      "Unstack": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEUnstack.cpp" ]
+        }
       }
     }
   }
diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h
index f9d560f..0c295aa 100644
--- a/src/core/CL/CLKernels.h
+++ b/src/core/CL/CLKernels.h
@@ -47,7 +47,6 @@
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
 #include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 #include "src/core/CL/kernels/CLPadLayerKernel.h"
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
deleted file mode 100644
index f0202a9..0000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/CL/kernels/CLMinMaxLayerKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <climits>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-        TensorShape output_shape = compute_min_max_shape(input);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    TensorShape output_shape = compute_min_max_shape(input);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
-    const unsigned int num_elems_processed_per_iteration = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output, 0, 0, 2, output->dimension(1));
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
-}
-} // namespace
-
-CLMinMaxLayerKernel::CLMinMaxLayerKernel()
-    : _input(nullptr), _output(nullptr)
-{
-    _type = CLKernelType::ELEMENTWISE;
-}
-
-void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "minmax_layer", build_opts);
-
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    ICLKernel::configure_internal(std::get<1>(win_config));
-}
-
-Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
-    return Status{};
-}
-
-void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
-{
-    _output->map(queue, true);
-
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator output(_output, window_output);
-
-    // Reset output
-    execute_window_loop(window_output, [&](const Coordinates &)
-    {
-        auto *ptr = reinterpret_cast<float *>(output.ptr());
-        ptr[0]    = std::numeric_limits<float>::max();
-        ptr[1]    = std::numeric_limits<float>::min();
-    },
-    output);
-
-    _output->unmap(queue);
-}
-
-void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
-    Window slice            = window_collapsed.first_slice_window_3D();
-    slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        Window output_slice = slice.shift_dimensions(2);
-
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_1D_tensor_argument(idx, _output, output_slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index aa2ff3f..0000000
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "src/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32.
-     * @param[out] output          Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
-     *
-     * @param[in] input  Input tensor info.  Data types supported: F32.
-     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    /** Resets global minimum and maximum
-     *
-     * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
-     */
-    void reset(cl::CommandQueue &queue);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 6d45a9d..af301c8 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -47,7 +47,6 @@
 #include "src/core/NEON/kernels/NELogicalKernel.h"
 #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
deleted file mode 100644
index 5ea8947..0000000
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <climits>
-#include <cstddef>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
-
-    if(output->tensor_shape().total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-        TensorShape output_shape = compute_min_max_shape(input);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    TensorShape output_shape = compute_min_max_shape(input);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, output_shape, 1, input->data_type());
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, 2);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win);
-}
-} // namespace
-
-NEMinMaxLayerKernel::NEMinMaxLayerKernel()
-    : _input(nullptr), _output(nullptr), _mtx()
-{
-}
-
-void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
-    INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
-
-    return Status{};
-}
-
-void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int x_start = window.x().start();
-    const int x_end   = window.x().end();
-
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over pixels
-    Window window_input(window);
-    window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_input.set(3, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, window_input);
-    Iterator output(_output, window_output);
-
-    execute_window_loop(window_output, [&](const Coordinates & id_batch)
-    {
-        float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
-        float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
-
-        float carry_min_scalar = std::numeric_limits<float>::max();
-        float carry_max_scalar = std::numeric_limits<float>::lowest();
-
-        execute_window_loop(window_input, [&](const Coordinates &)
-        {
-            int        x      = x_start;
-            const auto in_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);
-
-            // Vector loop
-            for(; x <= x_end - 8; x += 8)
-            {
-                const float32x4x2_t pixels   = vld2q_f32(in_ptr + x);
-                const float32x4_t   tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
-                const float32x4_t   tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
-                const float32x2_t   tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
-                const float32x2_t   tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
-                carry_min                    = vmin_f32(tmp_min2, carry_min);
-                carry_max                    = vmax_f32(tmp_max2, carry_max);
-            }
-
-            // Process leftover pixels
-            for(; x < x_end; ++x)
-            {
-                const float pixel = in_ptr[x];
-                carry_min_scalar  = std::min(pixel, carry_min_scalar);
-                carry_max_scalar  = std::max(pixel, carry_max_scalar);
-            }
-        },
-        input);
-
-        // Reduce result
-        carry_min = vpmin_f32(carry_min, carry_min);
-        carry_max = vpmax_f32(carry_max, carry_max);
-        carry_min = vpmin_f32(carry_min, carry_min);
-        carry_max = vpmax_f32(carry_max, carry_max);
-
-        // Extract max/min values
-        const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
-        const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
-
-        auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
-        // Perform reduction of local min/max values
-        update_min_max(out_ptr, min_i, max_i);
-    },
-    output);
-}
-
-void NEMinMaxLayerKernel::reset()
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
-    float32x2_t reset_values = vdup_n_f32(0.0f);
-    reset_values             = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
-    reset_values             = vset_lane_f32(std::numeric_limits<float>::lowest(), reset_values, 1);
-
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator output(_output, window_output);
-
-    execute_window_loop(window_output, [&](const Coordinates &)
-    {
-        vst1_f32(reinterpret_cast<float *>(output.ptr()), reset_values);
-    },
-    output);
-}
-
-void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max)
-{
-    arm_compute::lock_guard<Mutex> lock(_mtx);
-
-    const float32x2_t old_min = vld1_dup_f32(out_ptr);
-    const float32x2_t old_max = vld1_dup_f32(out_ptr + 1);
-    const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min);
-    const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max);
-
-    vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.h b/src/core/NEON/kernels/NEMinMaxLayerKernel.h
deleted file mode 100644
index b4852ad..0000000
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor. */
-class NEMinMaxLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxLayerKernel";
-    }
-    /** Default constructor */
-    NEMinMaxLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete;
-    /** Default destructor */
-    ~NEMinMaxLayerKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @note output[0] = minimum
-     * @note output[1] = maximum
-     *
-     * @param[in]  input  Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32.
-     * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor.
-     *                    The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
-     *
-     * @param[in] input  Input tensor info.  Data types supported: F32.
-     * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
-     *                   The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-    /** Resets global minimum and maximum. */
-    void reset();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    void update_min_max(float *out_ptr, float min, float max);
-    const ITensor     *_input;
-    ITensor           *_output;
-    arm_compute::Mutex _mtx;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
index f38912d..1c4c757 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -62,13 +62,11 @@
 
 namespace
 {
-
 bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
 {
   const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
   return qp->b_offset == 0;
 }
-
 }
 
 static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
deleted file mode 100644
index de9b857..0000000
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
-
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-
-#include "src/common/utils/Log.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value);
-}
-
-void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
-    ARM_COMPUTE_LOG_PARAMS(tensor, border_width, border_mode, constant_border_value);
-    auto k = std::make_unique<CLFillBorderKernel>();
-    k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value);
-    _kernel = std::move(k);
-}
diff --git a/tests/framework/instruments/OpenCLTimer.cpp b/tests/framework/instruments/OpenCLTimer.cpp
index 45eb4c5..e9f945b 100644
--- a/tests/framework/instruments/OpenCLTimer.cpp
+++ b/tests/framework/instruments/OpenCLTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,13 @@
 
 template <bool output_timestamps>
 OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
-    : _kernels(), _real_function(nullptr), _real_graph_function(nullptr), _prefix(), _timer_enabled(false)
+    : _kernels(),
+      _real_function(nullptr),
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+      _real_graph_function(nullptr),
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+      _prefix(),
+      _timer_enabled(false)
 {
     auto                        q     = CLScheduler::get().queue();
     cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
@@ -91,19 +97,17 @@
 {
     // Start intercepting enqueues:
     ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
-    ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
-    _real_function       = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-    _real_graph_function = graph::TaskExecutor::get().execute_function;
-    auto interceptor     = [this](
-                               cl_command_queue command_queue,
-                               cl_kernel        kernel,
-                               cl_uint          work_dim,
-                               const size_t    *gwo,
-                               const size_t    *gws,
-                               const size_t    *lws,
-                               cl_uint          num_events_in_wait_list,
-                               const cl_event * event_wait_list,
-                               cl_event *       event)
+    _real_function   = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+    auto interceptor = [this](
+                           cl_command_queue command_queue,
+                           cl_kernel        kernel,
+                           cl_uint          work_dim,
+                           const size_t    *gwo,
+                           const size_t    *gws,
+                           const size_t    *lws,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event)
     {
         if(this->_timer_enabled)
         {
@@ -138,7 +142,11 @@
             return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
         }
     };
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+    ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
+    _real_graph_function = graph::TaskExecutor::get().execute_function;
     // Start intercepting tasks:
     auto task_interceptor = [this](graph::ExecutionTask & task)
     {
@@ -153,9 +161,8 @@
         this->_real_graph_function(task);
         this->_prefix = "";
     };
-
-    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
     graph::TaskExecutor::get().execute_function = task_interceptor;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 }
 
 template <bool output_timestamps>
@@ -175,9 +182,11 @@
 {
     // Restore real function
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
+    _real_function                              = nullptr;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     graph::TaskExecutor::get().execute_function = _real_graph_function;
     _real_graph_function                        = nullptr;
-    _real_function                              = nullptr;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 }
 
 template <bool              output_timestamps>
diff --git a/tests/framework/instruments/OpenCLTimer.h b/tests/framework/instruments/OpenCLTimer.h
index 9904035..1812272 100644
--- a/tests/framework/instruments/OpenCLTimer.h
+++ b/tests/framework/instruments/OpenCLTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 Arm Limited.
+ * Copyright (c) 2017-2018, 2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,11 @@
     };
     std::list<kernel_info>                          _kernels;
     std::function<decltype(clEnqueueNDRangeKernel)> _real_function;
-    std::function<decltype(graph::execute_task)>    _real_graph_function;
-    std::string                                     _prefix;
-    bool                                            _timer_enabled;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+    std::function<decltype(graph::execute_task)> _real_graph_function;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+    std::string _prefix;
+    bool        _timer_enabled;
 #endif /* ARM_COMPUTE_CL */
 
 private:
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index 35f960d..b753485 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -129,16 +129,24 @@
 
 private:
     std::list<struct SchedulerClock<output_timestamps>::kernel_info> &_kernels;
-    std::map<std::string, SchedulerTimer::LayerData>                 &_layer_data_map;
-    IScheduler                                                       &_real_scheduler;
-    WallClock<output_timestamps>                                      _timer;
-    std::string                                                       _prefix;
+    std::map<std::string, SchedulerTimer::LayerData> &_layer_data_map;
+    IScheduler                  &_real_scheduler;
+    WallClock<output_timestamps> _timer;
+    std::string                  _prefix;
 };
 
 template <bool output_timestamps>
 SchedulerClock<output_timestamps>::SchedulerClock(ScaleFactor scale_factor)
-    : _kernels(), _layer_data_map(), _real_scheduler(nullptr), _real_scheduler_type(), _real_graph_function(nullptr),
-      _scale_factor(scale_factor), _interceptor(nullptr), _scheduler_users()
+    : _kernels(),
+      _layer_data_map(),
+      _real_scheduler(nullptr),
+      _real_scheduler_type(),
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
+      _real_graph_function(nullptr),
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+      _scale_factor(scale_factor),
+      _interceptor(nullptr),
+      _scheduler_users()
 {
     if(instruments_info != nullptr)
     {
@@ -149,6 +157,7 @@
 template <bool output_timestamps>
 void           SchedulerClock<output_timestamps>::test_start()
 {
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     // Start intercepting tasks:
     ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
     _real_graph_function  = graph::TaskExecutor::get().execute_function;
@@ -182,6 +191,7 @@
             scheduler->set_prefix("");
         }
     };
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
     ARM_COMPUTE_ERROR_ON(_real_scheduler != nullptr);
     _real_scheduler_type = Scheduler::get_type();
@@ -191,7 +201,9 @@
         _real_scheduler = &Scheduler::get();
         _interceptor    = std::make_shared<Interceptor<output_timestamps>>(_kernels, _layer_data_map, *_real_scheduler, _scale_factor);
         Scheduler::set(std::static_pointer_cast<IScheduler>(_interceptor));
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
         graph::TaskExecutor::get().execute_function = task_interceptor;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
         // Create an interceptor for each scheduler
         // TODO(COMPID-2638) : Allow multiple schedulers, now it assumes the same scheduler is used.
@@ -217,10 +229,12 @@
 {
     // Restore real scheduler
     Scheduler::set(_real_scheduler_type);
-    _real_scheduler                             = nullptr;
-    _interceptor                                = nullptr;
+    _real_scheduler = nullptr;
+    _interceptor    = nullptr;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     graph::TaskExecutor::get().execute_function = _real_graph_function;
     _real_graph_function                        = nullptr;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
 
     // Restore schedulers
     std::for_each(std::begin(_scheduler_users), std::end(_scheduler_users),
@@ -270,9 +284,9 @@
 }
 
 template <bool output_timestamps>
-std::string SchedulerClock<output_timestamps>::instrument_header() const
+std::string    SchedulerClock<output_timestamps>::instrument_header() const
 {
-    std::string output{""};
+    std::string output{ "" };
     output += R"("layer_data" : {)";
     for(auto i_it = _layer_data_map.cbegin(), i_end = _layer_data_map.cend(); i_it != i_end; ++i_it)
     {
diff --git a/tests/framework/instruments/SchedulerTimer.h b/tests/framework/instruments/SchedulerTimer.h
index 9cc0381..c437f27 100644
--- a/tests/framework/instruments/SchedulerTimer.h
+++ b/tests/framework/instruments/SchedulerTimer.h
@@ -97,14 +97,16 @@
     };
 
 private:
-    std::list<kernel_info>                       _kernels;
-    std::map<std::string, LayerData>             _layer_data_map;
-    IScheduler                                  *_real_scheduler;
-    Scheduler::Type                              _real_scheduler_type;
+    std::list<kernel_info> _kernels;
+    std::map<std::string, LayerData> _layer_data_map;
+    IScheduler     *_real_scheduler;
+    Scheduler::Type _real_scheduler_type;
+#ifdef ARM_COMPUTE_GRAPH_ENABLED
     std::function<decltype(graph::execute_task)> _real_graph_function;
-    ScaleFactor                                  _scale_factor;
-    std::shared_ptr<IScheduler>                  _interceptor;
-    std::vector<ISchedulerUser *>                _scheduler_users;
+#endif /* ARM_COMPUTE_GRAPH_ENABLED */
+    ScaleFactor                   _scale_factor;
+    std::shared_ptr<IScheduler>   _interceptor;
+    std::vector<ISchedulerUser *> _scheduler_users;
 };
 
 using SchedulerTimer      = SchedulerClock<false>;