Per-operator build dependencies

Creates a list of operators and their respective dependencies.
Alters the build system to walk through them, resolve the dependencies,
and build Compute Library.

Removes the following unused kernels/functions:
-[NE|CL]MinMaxLayerKernel
-CLFillBorder

Resolves: COMPMID-4695,COMPMID-4696

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I35ebeef38dac25ec5459cfe9c5f7c9a708621124
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/357914
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6295
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/filelist.json b/filelist.json
index 5171f39..4b85408 100644
--- a/filelist.json
+++ b/filelist.json
@@ -7,8 +7,80 @@
     "src/common/AllocatorWrapper.cpp",
     "src/common/ITensorV2.cpp",
     "src/common/TensorPack.cpp",
-    "src/common/IOperator.cpp"
+    "src/common/IOperator.cpp",
+    "src/core/AccessWindowAutoPadding.cpp",
+    "src/core/AccessWindowStatic.cpp",
+    "src/core/AccessWindowTranspose.cpp",
+    "src/core/Error.cpp",
+    "src/core/GPUTarget.cpp",
+    "src/core/Helpers.cpp",
+    "src/core/IAccessWindow.cpp",
+    "src/core/IKernel.cpp",
+    "src/core/ITensor.cpp",
+    "src/core/ITensorPack.cpp",
+    "src/core/Rounding.cpp",
+    "src/core/Size2D.cpp",
+    "src/core/SubTensorInfo.cpp",
+    "src/core/TensorInfo.cpp",
+    "src/core/Utils.cpp",
+    "src/core/Validate.cpp",
+    "src/core/Version.cpp",
+    "src/core/helpers/SoftmaxHelpers.cpp",
+    "src/core/helpers/WindowHelpers.cpp",
+    "src/core/utils/AssemblyUtils.cpp",
+    "src/core/utils/ScaleUtils.cpp",
+    "src/core/utils/helpers/fft.cpp",
+    "src/core/utils/helpers/tensor_transform.cpp",
+    "src/core/utils/io/FileHandler.cpp",
+    "src/core/utils/misc/MMappedFile.cpp",
+    "src/core/utils/quantization/AsymmHelpers.cpp",
+    "src/core/CPP/CPPTypes.cpp",
+    "src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp",
+    "src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp",
+    "src/core/CPP/kernels/CPPPermuteKernel.cpp",
+    "src/core/CPP/kernels/CPPTopKVKernel.cpp",
+    "src/core/CPP/kernels/CPPUpsampleKernel.cpp",
+    "src/runtime/Allocator.cpp",
+    "src/runtime/BlobLifetimeManager.cpp",
+    "src/runtime/BlobMemoryPool.cpp",
+    "src/runtime/ISimpleLifetimeManager.cpp",
+    "src/runtime/ITensorAllocator.cpp",
+    "src/runtime/IWeightsManager.cpp",
+    "src/runtime/IScheduler.cpp",
+    "src/runtime/Memory.cpp",
+    "src/runtime/MemoryManagerOnDemand.cpp",
+    "src/runtime/OffsetLifetimeManager.cpp",
+    "src/runtime/OffsetMemoryPool.cpp",
+    "src/runtime/OperatorTensor.cpp",
+    "src/runtime/PoolManager.cpp",
+    "src/runtime/RuntimeContext.cpp",
+    "src/runtime/Scheduler.cpp",
+    "src/runtime/SchedulerFactory.cpp",
+    "src/runtime/SchedulerUtils.cpp",
+    "src/runtime/SubTensor.cpp",
+    "src/runtime/Tensor.cpp",
+    "src/runtime/TensorAllocator.cpp",
+    "src/runtime/Utils.cpp",
+    "src/runtime/CPP/ICPPSimpleFunction.cpp",
+    "src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp",
+    "src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp",
+    "src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp",
+    "src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp",
+    "src/runtime/CPP/functions/CPPPermute.cpp",
+    "src/runtime/CPP/functions/CPPTopKV.cpp",
+    "src/runtime/CPP/functions/CPPUpsample.cpp"
   ],
+  "logging": [
+    "src/core/utils/logging/FilePrinter.cpp",
+    "src/core/utils/logging/Helpers.cpp",
+    "src/core/utils/logging/Logger.cpp",
+    "src/core/utils/logging/LoggerRegistry.cpp"
+  ],
+  "scheduler": {
+    "single": [ "src/runtime/CPP/SingleThreadScheduler.cpp" ],
+    "threads": [ "src/runtime/CPP/CPPScheduler.cpp" ],
+    "omp": [ "src/runtime/OMP/OMPScheduler.cpp"]
+  },
   "c_api": {
     "common": [
       "src/c/AclContext.cpp",
@@ -28,6 +100,14 @@
       "src/c/operators/AclActivation.cpp"
     ]
   },
+  "high_priority": [
+    "Activation",
+    "DepthwiseConv2d",
+    "Conv2d",
+    "Permute",
+    "Pool2d",
+    "Reshape"
+  ],
   "gpu": {
     "common": [
       "src/core/CL/CLCompileContext.cpp",
@@ -41,19 +121,11 @@
       "src/core/CL/ICLSimpleKernel.cpp",
       "src/core/CL/ICLTensor.cpp",
       "src/core/CL/OpenCL.cpp",
-      "src/gpu/cl/ClKernelLibrary.cpp",
-      "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp",
-      "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp",
-      "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp",
-      "src/core/CL/kernels/CLFillBorderKernel.cpp",
       "src/gpu/cl/ClContext.cpp",
+      "src/gpu/cl/ClKernelLibrary.cpp",
       "src/gpu/cl/ClQueue.cpp",
       "src/gpu/cl/ClTensor.cpp",
+      "src/core/CL/kernels/CLFillBorderKernel.cpp",
       "src/runtime/CL/CLBufferAllocator.cpp",
       "src/runtime/CL/CLGEMMHeuristicsHandle.cpp",
       "src/runtime/CL/CLHelpers.cpp",
@@ -68,888 +140,1022 @@
       "src/runtime/CL/CLTuner.cpp",
       "src/runtime/CL/ICLSimpleFunction.cpp",
       "src/runtime/CL/Utils.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp",
-      "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp",
-      "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp",
       "src/runtime/CL/mlgo/HeuristicTree.cpp",
       "src/runtime/CL/mlgo/MLGOHeuristics.cpp",
       "src/runtime/CL/mlgo/MLGOParser.cpp",
       "src/runtime/CL/mlgo/Utils.cpp",
       "src/runtime/CL/tuners/CLTuningParametersList.cpp"
     ],
-    "high_priority": [
-      "Activation",
-      "DepthwiseConv2d",
-      "DirectConv2d",
-      "Permute",
-      "Pool2d",
-      "Reshape"
-    ],
     "operators": {
-      "Activation": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClActivation.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClActivationKernel.cpp"
-          ]
-        }
-      },
-      "Add": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClAdd.cpp"
-          ]
-        }
-      },
-      "Cast": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCast.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCastKernel.cpp"
-          ]
-        }
-      },
-      "Concatenate": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConcatenate.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
-            "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
-            "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
-            "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp"
-          ]
-        }
-      },
-      "DirectConv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClDirectConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp"
-          ]
-        }
-      },
-      "FullyConnected": {
-        "deps": [
-          "ClFlatten",
-          "ClConvertFullyConnectedWeights",
-          "ClGemm",
-          "ClGemmLowpMatrixMultiplyCore",
-          "ClTranspose"
-        ],
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFullyConnected.cpp"
-          ]
-        }
-      },
-      "ConvertFullyConnectedWeights": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp"
-          ]
-        }
-      },
-      "Permute": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPermute.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClPermuteKernel.cpp"
-          ]
-        }
-      },
-      "Pool2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPool2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClPool2dKernel.cpp"
-          ]
-        }
-      },
-      "Conv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClConv2d.cpp"
-          ]
-        }
-      },
-      "PRelu": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClPRelu.cpp"
-          ]
-        }
-      },
-      "Reshape": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClReshape.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClReshapeKernel.cpp"
-          ]
-        }
-      },
-      "Copy": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCopy.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCopyKernel.cpp"
-          ]
-        }
-      },
-      "Crop": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClCrop.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClCropKernel.cpp"
-          ]
-        }
-      },
-      "Dequantize": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClDequantize.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClDequantizeKernel.cpp"
-          ]
-        }
-      },
-      "Elementwise": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClElementwiseOperations.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClElementwiseKernel.cpp"
-          ]
-        }
-      },
-      "ElementwiseUnary": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClElementwiseUnary.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp"
-          ]
-        }
-      },
-      "Fill": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFill.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClFillKernel.cpp"
-          ]
-        }
-      },
-      "Flatten": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFlatten.cpp"
-          ]
-        }
-      },
-      "Floor": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClFloor.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClFloorKernel.cpp"
-          ]
-        }
-      },
-      "GEMM": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClGemm.cpp",
-            "src/gpu/cl/operators/ClGemmConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowp": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp",
-            "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
-            "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" 
-          ]
-        }
-      },
-      "Mul": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClMul.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClMulKernel.cpp"
-          ]
-        }
-      },
-      "Quantize": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClQuantize.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClQuantizeKernel.cpp"
-          ]
-        }
-      },
-      "Scale": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClScale.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClScaleKernel.cpp"
-          ]
-        }
-      },
-      "Softmax": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClSoftmax.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClSoftmaxKernel.cpp"
-          ]
-        }
-      },
-      "Sub": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClSub.cpp"
-          ]
-        }
-      },
-      "Transpose": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClTranspose.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClTransposeKernel.cpp"
-          ]
-        }
-      },
-      "GenerateProposals": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp"
-          ]
-        }
-      },
-      "ArgMinMax": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "BatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "BatchToSpace": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
-      "Bitwise": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBitwiseKernel.cpp"
-          ]
-        }
-      },
-      "BoundingBoxTransform": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp"
-          ]
-        }
-      },
-      "ChannelShuffleLayer": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp"
-          ]
-        }
-      },
-      "GEMMConv2d": {
-        "files": {
-          "kernel": [
-            "src/gpu/cl/kernels/ClCol2ImKernel.cpp",
-            "src/gpu/cl/kernels/ClIm2ColKernel.cpp"
-          ]
-        }
-      },
-      "Comparison": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLComparisonKernel.cpp"
-          ]
-        }
-      },
-      "DeconvolutionLayerUpsample": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp"
-          ]
-        }
-      },
-      "DeconvolutionReshapeOutput": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp"
-          ]
-        }
-      },
-      "DepthToSpace": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
-      "DepthwiseConvolutionLayerNative": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp"
-          ]
-        }
-      },
-      "FFTDigitReverse": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp"
-          ]
-        }
-      },
-      "FFTRadixStage": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTRadixStageKernel.cpp"
-          ]
-        }
-      },
-      "FFTScale": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFFTScaleKernel.cpp"
-          ]
-        }
-      },
-      "FuseBatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "Gather": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLGatherKernel.cpp"
-          ]
-        }
-      },
-      "InstanceNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "L2Normalize": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp"
-          ]
-        }
-      },
-      "LogicalNot": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClLogicalNot.cpp"
-          ]
-        }
-      },
-      "MaxUnpooling": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "MeanStdDevNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "MinMax": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "Normalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLNormalizationLayerKernel.cpp"
-          ]
-        }
-      },
-      "NormalizePlanarYUV": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp"
-          ]
-        }
-      },
-      "Pad": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLPadLayerKernel.cpp"
-          ]
-        }
-      },
-      "PriorBox": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp"
-          ]
-        }
-      },
-      "QLSTMLayerNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "Range": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLRangeKernel.cpp"
-          ]
-        }
-      },
-      "ReductionOperation": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReductionOperationKernel.cpp"
-          ]
-        }
-      },
-      "Remap": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLRemapKernel.cpp"
-          ]
-        }
-      },
-      "Reorg": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReorgLayerKernel.cpp"
-          ]
-        }
-      },
-      "Reverse": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLReverseKernel.cpp"
-          ]
-        }
-      },
-      "ROIAlign": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLROIAlignLayerKernel.cpp"
-          ]
-        }
-      },
-      "ROIPooling": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "Select": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSelectKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToBatch": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToDepth": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp"
-          ]
-        }
-      },
-      "Stack": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLStackLayerKernel.cpp"
-          ]
-        }
-      },
-      "StridedSlice": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLStridedSliceKernel.cpp"
-          ]
-        }
-      },
-      "Tile": {
-        "files": {
-          "kernel": [
-            "src/core/CL/kernels/CLTileKernel.cpp"
-          ]
-        }
-      },
-      "WeightsReshape": {
-        "files": {
-          "kernel": [
-            "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp"
-          ]
-        }
-      },
-      "WinogradConv2d": {
-        "files": {
-          "operator": [
-            "src/gpu/cl/operators/ClWinogradConv2d.cpp"
-          ],
-          "kernel": [
-            "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp",
-            "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp",
-            "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp"
-          ]
-        }
+    "Activation":{
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClActivationKernel.cpp",
+          "src/gpu/cl/operators/ClActivation.cpp",
+          "src/runtime/CL/functions/CLActivationLayer.cpp"
+        ]
+      }
+    },
+    "ArgMinMax": {
+      "deps": [ "Reshape" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp",
+          "src/runtime/CL/functions/CLArgMinMaxLayer.cpp"
+        ]
+      }
+    },
+    "Add": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClAdd.cpp"
+        ]
+      }
+    },
+    "BatchNormalization": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp",
+          "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLBatchNormalizationLayer.cpp",
+          "src/runtime/CL/functions/CLFuseBatchNormalization.cpp"
+        ]
+      }
+    },
+    "BatchToSpace": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp",
+          "src/runtime/CL/functions/CLBatchToSpaceLayer.cpp"
+         ]
+      }
+    },
+    "Bitwise": {
+      "files": {
+        "common": [ "src/core/CL/kernels/CLBitwiseKernel.cpp" ]
+      }
+    },
+    "BitwiseAnd": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseAnd.cpp" ]
+      }
+    },
+    "BitwiseNot": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseNot.cpp" ]
+      }
+    },
+    "BitwiseOr": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseOr.cpp" ]
+      }
+    },
+    "BitwiseXor": {
+      "deps": [ "Bitwise" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLBitwiseXor.cpp" ]
+      }
+    },
+    "BoundingBoxTransform": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp",
+          "src/runtime/CL/functions/CLBoundingBoxTransform.cpp"
+        ]
+      }
+    },
+    "Cast": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCastKernel.cpp",
+          "src/gpu/cl/operators/ClCast.cpp",
+          "src/runtime/CL/functions/CLCast.cpp"
+        ]
+      }
+    },
+    "ChannelShuffle": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp",
+          "src/runtime/CL/functions/CLChannelShuffleLayer.cpp"
+        ]
+      }
+    },
+    "Comparison":  {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLComparisonKernel.cpp",
+          "src/runtime/CL/functions/CLComparison.cpp"
+        ]
+      }
+    },
+    "Concatenate": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp",
+          "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp",
+          "src/gpu/cl/operators/ClConcatenate.cpp",
+          "src/runtime/CL/functions/CLConcatenateLayer.cpp"
+        ]
+      }
+    },
+    "Conv2d": {
+      "deps": [
+        "Activation",
+        "ElementwiseBinary",
+        "FFT2D",
+        "Gemm",
+        "Mul",
+        "Pad",
+        "Permute",
+        "Reduction",
+        "Reshape",
+        "Reverse",
+        "Slice"
+      ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp",
+          "src/gpu/cl/kernels/ClIm2ColKernel.cpp",
+          "src/gpu/cl/kernels/ClCol2ImKernel.cpp",
+          "src/gpu/cl/operators/ClConv2d.cpp",
+          "src/gpu/cl/operators/ClDirectConv2d.cpp",
+          "src/gpu/cl/operators/ClGemmConv2d.cpp",
+          "src/gpu/cl/operators/ClWinogradConv2d.cpp",
+          "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp",
+          "src/runtime/CL/functions/CLConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLDirectConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp"
+        ]
+      }
+    },
+    "Copy": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCopyKernel.cpp",
+          "src/gpu/cl/operators/ClCopy.cpp",
+          "src/runtime/CL/functions/CLCopy.cpp"
+        ]
+      }
+    },
+    "CropResize": {
+      "deps": [ "Copy", "Fill", "Scale" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClCropKernel.cpp",
+          "src/gpu/cl/operators/ClCrop.cpp",
+          "src/runtime/CL/functions/CLCrop.cpp",
+          "src/runtime/CL/functions/CLCropResize.cpp"
+        ]
+      }
+    },
+    "Deconv2d": {
+      "deps": [ "Conv2d", "Reverse", "Transpose"],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp",
+          "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp",
+          "src/runtime/CL/functions/CLDeconvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp",
+          "src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp",
+          "src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp"
+        ]
+      }
+    },
+    "DepthConvert": {
+      "deps": [ "Cast"],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLDepthConvertLayer.cpp" ]
+      }
+    },
+    "DepthToSpace": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp",
+          "src/runtime/CL/functions/CLDepthToSpaceLayer.cpp"
+        ]
+      }
+    },
+    "DepthwiseConv2d": {
+      "deps": [ "Permute" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp",
+          "src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp"
+        ]
+      }
+    },
+    "Dequantize": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClDequantizeKernel.cpp",
+          "src/gpu/cl/operators/ClDequantize.cpp",
+          "src/runtime/CL/functions/CLDequantizationLayer.cpp"
+        ]
+      }
+    },
+    "ElementwiseBinary": {
+      "deps": ["Add", "Sub"],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClElementwiseOperations.cpp",
+          "src/runtime/CL/functions/CLElementwiseOperations.cpp"
+        ]
+      }
+    },
+    "ElementwiseUnary":{
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp",
+          "src/gpu/cl/operators/ClElementwiseUnary.cpp",
+          "src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp"
+        ]
+      }
+    },
+    "FFT1D": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp",
+          "src/core/CL/kernels/CLFFTRadixStageKernel.cpp",
+          "src/core/CL/kernels/CLFFTScaleKernel.cpp",
+          "src/runtime/CL/functions/CLFFT1D.cpp"
+        ]
+      }
+    },
+    "FFT2D": {
+      "deps": [ "FFT1D" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLFFT2D.cpp" ]
+      }
+    },
+    "Fill": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClFillKernel.cpp",
+          "src/gpu/cl/operators/ClFill.cpp",
+          "src/runtime/CL/functions/CLFill.cpp"
+        ]
+      }
+    },
+    "Flatten": {
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClFlatten.cpp",
+          "src/runtime/CL/functions/CLFlattenLayer.cpp"
+        ]
+      }
+    },
+    "Floor": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClFloorKernel.cpp",
+          "src/gpu/cl/operators/ClFloor.cpp",
+          "src/runtime/CL/functions/CLFloor.cpp"
+        ]
+      }
+    },
+    "FullyConnected": {
+      "deps": [ "Flatten", "Gemm", "Transpose"],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp",
+          "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp",
+          "src/gpu/cl/operators/ClFullyConnected.cpp",
+          "src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp",
+          "src/runtime/CL/functions/CLFullyConnectedLayer.cpp"
+        ]
+      }
+    },
+    "Gather": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLGatherKernel.cpp",
+          "src/runtime/CL/functions/CLGather.cpp"]
+      }
+    },
+    "Gemm": {
+      "deps": [ "Cast" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp",
+          "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp",
+          "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp",
+          "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp",
+          "src/gpu/cl/operators/ClGemm.cpp",
+          "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp",
+          "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp",
+          "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp",
+          "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp",
+          "src/runtime/CL/functions/CLGEMM.cpp",
+          "src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp",
+          "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp"
+        ]
+      }
+    },
+    "GenerateProposals": {
+      "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp",
+          "src/runtime/CL/functions/CLGenerateProposalsLayer.cpp"
+        ]
+      }
+    },
+    "InstanceNormalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp",
+          "src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp"
+        ]
+      }
+    },
+    "L2Normalize": {
+      "deps": [ "Reduction" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp",
+          "src/runtime/CL/functions/CLL2NormalizeLayer.cpp"
+        ]
+      }
+    },
+    "Logical": {
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClLogicalNot.cpp",
+          "src/runtime/CL/functions/CLLogicalAnd.cpp",
+          "src/runtime/CL/functions/CLLogicalNot.cpp",
+          "src/runtime/CL/functions/CLLogicalOr.cpp"
+        ]
+      }
+    },
+    "LSTM": {
+      "deps": [
+        "Activation",
+        "Concatenate",
+        "Copy",
+        "Dequantize",
+        "ElementwiseBinary",
+        "Fill",
+        "FullyConnected",
+        "Gemm",
+        "MeanStdDevNormalize",
+        "Mul",
+        "Quantize",
+        "Slice",
+        "Transpose"
+      ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLQLSTMLayer.cpp",
+          "src/runtime/CL/functions/CLLSTMLayer.cpp",
+          "src/runtime/CL/functions/CLLSTMLayerQuantized.cpp"
+        ]
+      }
+    },
+    "MaxUnpool2d": {
+      "deps": [ "Fill" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
+          "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp"
+        ]
+      }
+    },
+    "MeanStdDevNormalize": {
+      "deps": [ "Reduction" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
+          "src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp",
+          "src/runtime/CL/functions/CLReduceMean.cpp"
+        ]
+      }
+    },
+    "Mul": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClMulKernel.cpp",
+          "src/gpu/cl/operators/ClMul.cpp",
+          "src/runtime/CL/functions/CLPixelWiseMultiplication.cpp"
+        ]
+      }
+    },
+    "Normalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLNormalizationLayerKernel.cpp",
+          "src/runtime/CL/functions/CLNormalizationLayer.cpp"
+        ]
+      }
+    },
+    "Pad": {
+      "deps": [ "Copy" ],
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLPadLayerKernel.cpp",
+          "src/runtime/CL/functions/CLPadLayer.cpp"
+        ]
+      }
+    },
+    "Permute": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClPermuteKernel.cpp",
+          "src/gpu/cl/operators/ClPermute.cpp",
+          "src/runtime/CL/functions/CLPermute.cpp"
+        ]
+      }
+    },
+    "Pool2d": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClPool2dKernel.cpp",
+          "src/gpu/cl/operators/ClPool2d.cpp",
+          "src/runtime/CL/functions/CLPoolingLayer.cpp"
+        ]
+      }
+    },
+    "PRelu": {
+      "deps": [ "ElementwiseBinary" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/operators/ClPRelu.cpp",
+          "src/runtime/CL/functions/CLPReluLayer.cpp"
+        ]
+      }
+    },
+    "PriorBox": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp",
+          "src/runtime/CL/functions/CLPriorBoxLayer.cpp"
+        ]
+      }
+    },
+    "Quantize": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClQuantizeKernel.cpp",
+          "src/gpu/cl/operators/ClQuantize.cpp",
+          "src/runtime/CL/functions/CLQuantizationLayer.cpp"
+        ]
+      }
+    },
+    "Range": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLRangeKernel.cpp",
+          "src/runtime/CL/functions/CLRange.cpp"
+        ]
+      }
+    },
+    "Reduction": {
+      "deps": [ "Reshape" ],
+      "files": {
+        "common": [ 
+          "src/core/CL/kernels/CLReductionOperationKernel.cpp",
+          "src/runtime/CL/functions/CLReductionOperation.cpp"
+        ]
+      }
+    },
+    "Remap": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLRemapKernel.cpp",
+          "src/runtime/CL/functions/CLRemap.cpp"]
+      }
+    },
+    "Reorg": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLReorgLayerKernel.cpp",
+          "src/runtime/CL/functions/CLReorgLayer.cpp"
+        ]
+      }
+    },
+    "Reshape": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClReshapeKernel.cpp",
+          "src/gpu/cl/operators/ClReshape.cpp",
+          "src/runtime/CL/functions/CLReshapeLayer.cpp"
+        ]
+      }
+    },
+    "Reverse": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLReverseKernel.cpp",
+          "src/runtime/CL/functions/CLReverse.cpp"
+        ]
+      }
+    },
+    "RNN": {
+      "deps": [ "Activation", "Cast", "ElementwiseBinary", "FullyConnected", "Gemm"],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLRNNLayer.cpp" ]
+      }
+    },
+    "ROIAlign": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLROIAlignLayerKernel.cpp",
+          "src/runtime/CL/functions/CLROIAlignLayer.cpp"
+        ]
+      }
+    },
+    "ROIPool2d": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp",
+          "src/runtime/CL/functions/CLROIPoolingLayer.cpp"
+        ]
+      }
+    },
+    "Scale": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClScaleKernel.cpp",
+          "src/gpu/cl/operators/ClScale.cpp",
+          "src/runtime/CL/functions/CLScale.cpp"
+        ]
+      }
+    },
+    "Select": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSelectKernel.cpp",
+          "src/runtime/CL/functions/CLSelect.cpp"
+        ]
+      }
+    },
+    "Slice": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLSlice.cpp" ]
+      }
+    },
+    "Softmax": {
+      "deps": [ "Permute" ],
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClSoftmaxKernel.cpp",
+          "src/gpu/cl/operators/ClSoftmax.cpp",
+          "src/runtime/CL/functions/CLSoftmaxLayer.cpp"
+        ]
+      }
+    },
+    "SpaceToBatch": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp",
+          "src/runtime/CL/functions/CLSpaceToBatchLayer.cpp"
+        ]
+      }
+    },
+    "SpaceToDepth": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp",
+          "src/runtime/CL/functions/CLSpaceToDepthLayer.cpp"
+        ]
+      }
+    },
+    "Split": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLSplit.cpp" ]
+      }
+    },
+    "Stack": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLStackLayerKernel.cpp",
+          "src/runtime/CL/functions/CLStackLayer.cpp"
+        ]
+      }
+    },
+    "StridedSlice": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLStridedSliceKernel.cpp",
+          "src/runtime/CL/functions/CLStridedSlice.cpp"
+        ]
+      }
+    },
+    "Sub": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClElementwiseKernel.cpp",
+          "src/gpu/cl/operators/ClSub.cpp"
+        ]
+      }
+    },
+    "Tile": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLTileKernel.cpp",
+          "src/runtime/CL/functions/CLTile.cpp"
+        ]
+      }
+    },
+    "Transpose": {
+      "files": {
+        "common": [
+          "src/gpu/cl/kernels/ClTransposeKernel.cpp",
+          "src/gpu/cl/operators/ClTranspose.cpp",
+          "src/runtime/CL/functions/CLTranspose.cpp"
+        ]
+      }
+    },
+    "Unstack": {
+      "deps": [ "StridedSlice" ],
+      "files": {
+        "common": [ "src/runtime/CL/functions/CLUnstack.cpp" ]
+      }
+    },
+    "YUVNormalize": {
+      "files": {
+        "common": [
+          "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp",
+          "src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp"
+        ]
       }
     }
-  },
+  }
+},
   "cpu": {
     "common": [
       "src/cpu/CpuContext.cpp",
       "src/cpu/CpuQueue.cpp",
-      "src/cpu/CpuTensor.cpp"
-    ],
-    "high_priority": [
-      "Activation",
-      "DepthwiseConv2d",
-      "DirectConv2d",
-      "Permute",
-      "Pool2d",
-      "Reshape",
-      "FillBorder"
+      "src/cpu/CpuTensor.cpp",
+      "src/core/NEON/kernels/NEFillBorderKernel.cpp",
+      "src/runtime/NEON/INEOperator.cpp",
+      "src/runtime/NEON/INESimpleFunction.cpp",
+      "src/runtime/NEON/INESimpleFunctionNoBorder.cpp"
     ],
     "operators": {
       "Activation": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuActivation.cpp"
+          "common": [
+            "src/cpu/operators/CpuActivation.cpp",
+            "src/cpu/kernels/CpuActivationKernel.cpp",
+            "src/runtime/NEON/functions/NEActivationLayer.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuActivationKernel.cpp"
-          ],
-          "sve": {
-            "fp32": [
-              "src/cpu/kernels/activation/sve/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/activation/sve/fp16.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/activation/sve/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/activation/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/activation/sve/qasymm8_signed.cpp"
-            ]
-          },
           "neon": {
-            "fp32": [
-              "src/cpu/kernels/activation/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/activation/neon/fp16.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/activation/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/activation/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/activation/neon/qasymm8_signed.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp" ]
+          },
+          "sve": {
+            "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp", "src/cpu/kernels/activation/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp", "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp", "src/cpu/kernels/activation/sve/qsymm16.cpp" ]
           }
         }
       },
+      "ArgMinMax": {
+        "deps": [ "Reduction" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEArgMinMaxLayer.cpp" ]
+        }
+      },
       "Add": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuAdd.cpp"
+          "common": [
+            "src/cpu/operators/CpuAdd.cpp",
+            "src/cpu/kernels/CpuAddKernel.cpp",
+            "src/runtime/NEON/functions/NEArithmeticAddition.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuAddKernel.cpp"
-          ],
-          "sve": {
-            "all": [
-              "src/cpu/kernels/add/sve/impl.cpp"
-            ],
-            "qsymm16": [
-              "src/cpu/kernels/add/sve/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/add/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/add/sve/qasymm8_signed.cpp"
-            ]
-          },
           "neon": {
-            "qsymm16": [
-              "src/cpu/kernels/add/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/add/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/add/neon/qasymm8_signed.cpp"
-            ]
+            "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp" ]
+          },
+          "sve": {
+            "common": [ "src/cpu/kernels/add/sve/impl.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp", "src/cpu/kernels/add/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp", "src/cpu/kernels/add/sve/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp", "src/cpu/kernels/add/sve/qsymm16.cpp" ]
           }
         }
       },
-      "BatchNorm": {
+      "BatchNormalize": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp",
+            "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp",
+            "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp"
           ],
-          "sve": {
-            "fp32": [
-              "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp"
-            ],
-            "fp16": [
-              "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp"
-            ]
-          },
           "neon": {
-            "fp32": [
-              "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp"
-            ],
-            "fp16": [
-              "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp"
-            ]
+            "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" ],
+            "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" ]
+          },
+          "sve": {
+            "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" ],
+            "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" ]
           }
         }
       },
       "BatchToSpace": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp"
           ]
         }
       },
       "BitwiseAnd": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseAndKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseAndKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseAnd.cpp"
           ]
         }
       },
       "BitwiseNot": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseNotKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseNotKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseNot.cpp"
           ]
         }
       },
       "BitwiseOr": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseOrKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseOrKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseOr.cpp"
           ]
         }
       },
       "BitwiseXor": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBitwiseXorKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBitwiseXorKernel.cpp",
+            "src/runtime/NEON/functions/NEBitwiseXor.cpp"
           ]
         }
       },
       "BoundingBoxTransform": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp"
-          ]
-        }
-      },
-      "ChannelShuffleLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp"
-          ]
-        }
-      },
-      "Col2Im": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuCol2ImKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp",
+            "src/runtime/NEON/functions/NEBoundingBoxTransform.cpp"
           ]
         }
       },
       "Cast": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuCast.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuCastKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuCast.cpp",
+            "src/cpu/kernels/CpuCastKernel.cpp",
+            "src/runtime/NEON/functions/NECast.cpp"
+          ]
+        }
+      },
+      "ChannelShuffle": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEChannelShuffleLayer.cpp"
           ]
         }
       },
       "Concatenate": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuConcatenate.cpp"
-          ],
-          "kernel": [
+          "common": [
+            "src/cpu/operators/CpuConcatenate.cpp",
             "src/cpu/kernels/CpuConcatenateWidthKernel.cpp",
             "src/cpu/kernels/CpuConcatenateBatchKernel.cpp",
             "src/cpu/kernels/CpuConcatenateDepthKernel.cpp",
-            "src/cpu/kernels/CpuConcatenateHeightKernel.cpp"
+            "src/cpu/kernels/CpuConcatenateHeightKernel.cpp",
+            "src/runtime/NEON/functions/NEConcatenateLayer.cpp"
           ]
         }
       },
-      "ConvertFullyConnectedWeights": {
+      "Conv2d": {
+        "deps": [
+          "Activation",
+          "ElementwiseBinary",
+          "FFT2D",
+          "Gemm",
+          "Mul",
+          "Pad",
+          "Permute",
+          "Reshape",
+          "Reverse",
+          "Slice"
+        ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuConvertFullyConnectedWeights.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp"
-          ]
-        }
-      },
-      "ConvertQuantizedSignedness": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp"
-          ]
-        }
-      },
-      "Convolution": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuConv2d.cpp"
+          "common": [
+            "src/cpu/operators/CpuConv2d.cpp",
+            "src/cpu/operators/CpuDirectConv2d.cpp",
+            "src/cpu/operators/CpuGemmDirectConv2d.cpp",
+            "src/cpu/operators/CpuGemmConv2d.cpp",
+            "src/cpu/operators/CpuWinogradConv2d.cpp",
+            "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
+            "src/cpu/kernels/CpuDirectConv2dKernel.cpp",
+            "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp",
+            "src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
+            "src/cpu/kernels/CpuCol2ImKernel.cpp",
+            "src/cpu/kernels/CpuIm2ColKernel.cpp",
+            "src/cpu/kernels/CpuWeightsReshapeKernel.cpp",
+            "src/core/NEON/kernels/convolution/common/padding.cpp",
+            "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
+            "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
+            "src/core/NEON/kernels/convolution/common/utils.cpp",
+            "src/core/NEON/kernels/convolution/winograd/padding.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
+            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
+            "src/runtime/NEON/functions/NEConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEGEMMConv2d.cpp",
+            "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
+            "src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp"
           ]
         }
       },
       "Copy": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuCopy.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuCopyKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuCopy.cpp",
+            "src/cpu/kernels/CpuCopyKernel.cpp",
+            "src/runtime/NEON/functions/NECopy.cpp"
           ]
         }
       },
-      "Crop": {
+      "CropResize": {
+        "deps": [ "Scale" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NECropKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NECropKernel.cpp",
+            "src/runtime/NEON/functions/NECropResize.cpp"
+          ]
+        }
+      },
+      "Deconv2d": {
+        "deps": [ "Conv2d", "Reverse", "Transpose"],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NEDeconvolutionLayer.cpp"
+          ]
+        }
+      },
+      "DepthConvert": {
+        "deps": [ "Cast"],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NEDepthConvertLayer.cpp"
+          ]
+        }
+      },
+      "DepthToSpace": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp"
           ]
         }
       },
       "DepthwiseConv2d": {
-        "deps": [
-          "Activation",
-          "Permute"
-        ],
+        "deps": [ "Activation", "Permute" ],
         "files": {
-          "operator": [
+          "common": [
             "src/cpu/operators/CpuDepthwiseConv2d.cpp",
             "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp",
-            "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp"
-          ],
-          "kernel": [
+            "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp",
+            "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp",
             "src/core/NEON/kernels/convolution/common/padding.cpp",
             "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
             "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
             "src/core/NEON/kernels/convolution/common/utils.cpp",
-            "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp"
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
+            "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+            "src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp"
           ],
+          "neon": {
+            "estate64": [
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp"
+            ]
+          },
           "sve": {
-            "all": [
+            "common": [
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp",
@@ -994,17 +1200,7 @@
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp"
-            ]
-          },
-          "neon": {
-            "estate64": [
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
-              "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+              "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
               "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
@@ -1059,166 +1255,122 @@
           }
         }
       },
-      "DepthToSpaceLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp"
-          ]
-        }
-      },
       "Dequantize": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuDequantize.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuDequantizeKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuDequantize.cpp",
+            "src/cpu/kernels/CpuDequantizeKernel.cpp",
+            "src/runtime/NEON/functions/NEDequantizationLayer.cpp"
           ]
         }
       },
-      "DirectConv2d": {
-        "deps": [
-          "Activation",
-          "FillBorder"
-        ],
+      "DetectionPostProcess": {
+        "deps": [ "Dequantize" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuDirectConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuDirectConv2dKernel.cpp",
-            "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp"
-          ]
+          "common": [ "src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp" ]
         }
       },
-      "Elementwise": {
+      "ElementwiseBinary": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuElementwise.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuElementwiseKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuElementwise.cpp",
+            "src/cpu/kernels/CpuElementwiseKernel.cpp",
+            "src/runtime/NEON/functions/NEElementwiseOperations.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/elementwise/sve/elementwise.cpp"
-            ]
+            "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ]
           }
         }
       },
-      "ElementwiseUnary": {
+      "ElementwiseUnary": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuElementwiseUnary.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuElementwiseUnary.cpp",
+            "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp",
+            "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp"
-            ]
+            "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ]
           }
         }
       },
       "FFT1D": {
         "files": {
-          "kernel": [
+          "common": [
             "src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp",
             "src/core/NEON/kernels/NEFFTRadixStageKernel.cpp",
-            "src/core/NEON/kernels/NEFFTScaleKernel.cpp"
+            "src/core/NEON/kernels/NEFFTScaleKernel.cpp",
+            "src/runtime/NEON/functions/NEFFT1D.cpp"
           ]
         }
       },
-      "FillBorder": {
+      "FFT2D": {
+        "deps": [ "FFT1D" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEFillBorderKernel.cpp"
-          ]
-        }
-      },
-      "Flatten": {
-        "deps: ": [
-          "Reshape"
-        ],
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuFlatten.cpp"
+          "common": [
+            "src/runtime/NEON/functions/NEFFT2D.cpp"
           ]
         }
       },
       "Fill": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFill.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuFillKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuFill.cpp",
+            "src/cpu/kernels/CpuFillKernel.cpp",
+            "src/runtime/NEON/functions/NEFill.cpp"
+          ]
+        }
+      },
+      "Flatten": {
+        "deps": [ "Reshape" ],
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuFlatten.cpp",
+            "src/runtime/NEON/functions/NEFlattenLayer.cpp"
           ]
         }
       },
       "Floor": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFloor.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuFloorKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuFloor.cpp",
+            "src/cpu/kernels/CpuFloorKernel.cpp",
+            "src/runtime/NEON/functions/NEFloor.cpp"
           ],
           "neon": {
-            "fp32": [
-              "src/cpu/kernels/floor/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/floor/neon/fp16.cpp"
-            ]
+            "fp32": [ "src/cpu/kernels/floor/neon/fp32.cpp" ],
+            "fp16": [ "src/cpu/kernels/floor/neon/fp16.cpp" ]
           }
         }
       },
       "FullyConnected": {
-        "deps": [
-          "CpuFlatten",
-          "CpuConvertFullyConnectedWeights",
-          "CpuGemm",
-          "CpuGemmLowpMatrixMultiplyCore"
-        ],
+        "deps": [ "Flatten", "Gemm", "Transpose" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuFullyConnected.cpp"
-          ]
-        },
-        "kernel": [
-          "CpuTransposeKernel"
-        ]
-      },
-      "FuseBatchNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp"
+          "common": [
+            "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp",
+            "src/cpu/operators/CpuConvertFullyConnectedWeights.cpp",
+            "src/cpu/operators/CpuFullyConnected.cpp",
+            "src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp",
+            "src/runtime/NEON/functions/NEFullyConnectedLayer.cpp"
           ]
         }
       },
-      "GEMM": {
+      "Gather": {
         "files": {
-          "operator" : ["src/cpu/operators/CpuGemm.cpp"],
-          "kernel": [
+          "common": [
+            "src/core/NEON/kernels/NEGatherKernel.cpp",
+            "src/runtime/NEON/functions/NEGather.cpp"
+          ]
+        }
+      },
+      "Gemm": {
+        "files": {
+          "common": [
+            "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp",
             "src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp",
             "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp",
             "src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp",
-            "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp"
-          ]
-        }
-      },
-      "GEMMLowp": {
-        "deps": [
-          "GemmAssemblyDispatch"
-        ],
-        "files": {
-          "operator" : [
-              "src/cpu/operators/CpuGemmLowpOutputStage.cpp",
-              "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp"
-          ],
-          "kernel": [
+            "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
@@ -1226,36 +1378,12 @@
             "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp",
             "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp",
-            "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp"
-          ]
-        }
-      },
-      "GEMMConvolution": {
-        "deps": [
-          "Activation",
-          "Col2Im",
-          "Reshape",
-          "Im2Col",
-          "GEMMLowpOffsetContributionOutputStage",
-          "ConvertQuantizedSignedness"
-        ],
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuGemmConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuWeightsReshapeKernel.cpp"
-          ]
-        }
-      },
-      "GemmAssemblyDispatch": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp"
-          ],
-          "kernel": [
-            "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
+            "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp",
+            "src/cpu/operators/CpuGemm.cpp",
+            "src/cpu/operators/CpuGemmLowpOutputStage.cpp",
+            "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
+            "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_int8.cpp",
@@ -1263,14 +1391,17 @@
             "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
             "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+            "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
             "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp",
             "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
-            "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
             "src/core/NEON/kernels/arm_gemm/misc.cpp",
             "src/core/NEON/kernels/arm_gemm/quantized.cpp",
             "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
             "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
-            "src/core/NEON/kernels/arm_gemm/transform.cpp"
+            "src/core/NEON/kernels/arm_gemm/transform.cpp",
+            "src/runtime/NEON/functions/NEGEMM.cpp",
+            "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
+            "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp"
           ],
           "neon": {
             "estate32": [
@@ -1344,7 +1475,7 @@
             ]
           },
           "sve": {
-            "all": [
+            "common": [
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp",
@@ -1384,152 +1515,196 @@
               "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
               "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp",
-              "src/core/NEON/kernels/arm_gemm/transform-sve.cpp"
+              "src/core/NEON/kernels/arm_gemm/transform-sve.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
+              "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp"
             ]
           }
         }
       },
-      "GemmDirectConv2d": {
-        "deps": [
-          "Activation",
-          "GemmAssemblyDispatch",
-          "Permute"
-        ],
+      "GenerateProposals": {
+        "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuGemmDirectConv2d.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp"
           ]
         }
       },
-      "Mul": {
+      "InstanceNormalize": {
+        "deps": [ "Permute", "Reduction" ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuMul.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuMulKernel.cpp"
-          ]
-        }
-      },
-      "Quantize": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuQuantize.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuQuantizeKernel.cpp"
-          ]
-        }
-      },
-      "Reshape": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuReshape.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuReshapeKernel.cpp"
-          ]
-        }
-      },
-      "Gather": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEGatherKernel.cpp"
-          ]
-        }
-      },
-      "GenerateProposalsLayer": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp"
-          ]
-        }
-      },
-      "Im2Col": {
-        "files": {
-          "kernel": [
-            "src/cpu/kernels/CpuIm2ColKernel.cpp"
-          ]
-        }
-      },
-      "InstanceNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp"
           ]
         }
       },
       "L2Normalize": {
-        "deps": [
-          "Reduction"
-        ],
+        "deps": [ "Reduction" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEL2NormalizeLayer.cpp"
           ]
         }
       },
       "Logical": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NELogicalKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NELogicalKernel.cpp",
+            "src/runtime/NEON/functions/NELogical.cpp"
           ]
         }
       },
-      "MaxUnpooling": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "MeanStdDevNormalization": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp"
-          ]
-        }
-      },
-      "MinMax": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp"
-          ]
-        }
-      },
-      "Normalization": {
+      "LSTM": {
         "deps": [
-          "PixelWiseMultiplication"
+          "Activation",
+          "Concatenate",
+          "Copy",
+          "Dequantize",
+          "ElementwiseBinary",
+          "Fill",
+          "FullyConnected",
+          "Gemm",
+          "MeanStdDevNormalize",
+          "Mul",
+          "Quantize",
+          "Slice",
+          "Transpose"
         ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NENormalizationLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NELSTMLayer.cpp",
+            "src/runtime/NEON/functions/NELSTMLayerQuantized.cpp",
+            "src/runtime/NEON/functions/NEQLSTMLayer.cpp"
+          ]
+        }
+      },
+      "MaxUnpool2d": {
+        "deps": [ "Fill" ],
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp"
+          ]
+        }
+      },
+      "Mean": {
+        "deps": [ "Reduction" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEReduceMean.cpp" ]
+        }
+      },
+      "MeanStdDevNormalize": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
+            "src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp"
+          ]
+        }
+      },
+      "Mul": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuMul.cpp",
+            "src/cpu/kernels/CpuMulKernel.cpp",
+            "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp"
+          ]
+        }
+      },
+      "Normalize": {
+        "deps": [ "Mul" ],
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
+            "src/runtime/NEON/functions/NENormalizationLayer.cpp"
           ]
         }
       },
       "Pad": {
+        "deps": [ "Concatenate", "Copy", "StridedSlice" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEPadLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEPadLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEPadLayer.cpp"
           ]
         }
       },
       "Permute": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuPermute.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuPermuteKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuPermute.cpp",
+            "src/cpu/kernels/CpuPermuteKernel.cpp",
+            "src/runtime/NEON/functions/NEPermute.cpp"
           ]
         }
       },
       "Pool2d": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuPool2d.cpp"
-          ],
-          "kernel": [
+          "common": [
+            "src/cpu/operators/CpuPool2d.cpp",
             "src/cpu/kernels/CpuPool2dKernel.cpp",
             "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
@@ -1538,24 +1713,15 @@
             "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp",
             "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp",
-            "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp"
+            "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp",
+            "src/runtime/NEON/functions/NEPoolingLayer.cpp"
           ],
           "neon": {
-            "nchw": [
-              "src/cpu/kernels/pool2d/neon/nchw/all.cpp"
-            ],
-            "fp32": [
-              "src/cpu/kernels/pool2d/neon/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/pool2d/neon/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/pool2d/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"
-            ],
+            "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
+            "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ],
             "estate64": [
               "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
@@ -1578,15 +1744,17 @@
             ]
           },
           "sve": {
-            "all": [
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+            "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ],
+            "common": [
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp",
@@ -1596,240 +1764,259 @@
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp",
               "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
-              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp"
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
+              "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
             ]
           }
         }
       },
-      "PriorBox": {
+      "PRelu": {
+        "deps": [ "ElementwiseBinary" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp"
+          "common": [
+            "src/runtime/NEON/functions/NEPReluLayer.cpp"
           ]
         }
       },
-      "QLSTMLayerNormalization": {
+      "PriorBox": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEPriorBoxLayer.cpp"
+          ]
+        }
+      },
+      "Quantize": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuQuantize.cpp",
+            "src/cpu/kernels/CpuQuantizeKernel.cpp",
+            "src/runtime/NEON/functions/NEQuantizationLayer.cpp"
           ]
         }
       },
       "Range": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NERangeKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NERangeKernel.cpp",
+            "src/runtime/NEON/functions/NERange.cpp"
           ]
         }
       },
-      "ReductionOperation": {
+      "Reduction":{
+        "deps": [ "Reshape" ],
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReductionOperationKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReductionOperationKernel.cpp",
+            "src/runtime/NEON/functions/NEReductionOperation.cpp"
           ]
         }
       },
       "Remap": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NERemapKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NERemapKernel.cpp",
+            "src/runtime/NEON/functions/NERemap.cpp"
           ]
         }
       },
       "Reorg": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReorgLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReorgLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEReorgLayer.cpp"
+          ]
+        }
+      },
+      "Reshape": {
+        "files": {
+          "common": [
+            "src/cpu/operators/CpuReshape.cpp",
+            "src/cpu/kernels/CpuReshapeKernel.cpp",
+            "src/runtime/NEON/functions/NEReshapeLayer.cpp"
           ]
         }
       },
       "Reverse": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEReverseKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEReverseKernel.cpp",
+            "src/runtime/NEON/functions/NEReverse.cpp"
           ]
         }
       },
+      "RNN": {
+        "deps": [ "Activation", "Add", "FullyConnected", "Gemm"],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NERNNLayer.cpp" ]
+        }
+      },
       "ROIAlign": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEROIAlignLayer.cpp"
           ]
         }
       },
-      "ROIPooling": {
+      "ROIPool2d": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp"
-          ]
-        }
-      },
-      "Select": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESelectKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToBatch": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp"
-          ]
-        }
-      },
-      "SpaceToDepth": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp"
-          ]
-        }
-      },
-      "Stack": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEStackLayerKernel.cpp"
-          ]
-        }
-      },
-      "StridedSlice": {
-        "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NEStridedSliceKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEROIPoolingLayer.cpp"
           ]
         }
       },
       "Scale": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuScale.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuScaleKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuScale.cpp",
+            "src/cpu/kernels/CpuScaleKernel.cpp",
+            "src/runtime/NEON/functions/NEScale.cpp"
           ],
           "sve": {
-            "fp32": [
-              "src/cpu/kernels/scale/sve/fp32.cpp"
-            ],
-            "fp16": [
-              "src/cpu/kernels/scale/sve/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/scale/sve/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/scale/sve/qasymm8_signed.cpp"
-            ],
-            "integer": [
-              "src/cpu/kernels/scale/sve/integer.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/scale/sve/fp16.cpp" ],
+            "fp32": [ "src/cpu/kernels/scale/sve/fp32.cpp" ],
+            "integer": [ "src/cpu/kernels/scale/sve/integer.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/scale/sve/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/scale/sve/qasymm8_signed.cpp" ]
+
           },
           "neon": {
-            "fp16": [
-              "src/cpu/kernels/scale/neon/fp16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/scale/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/scale/neon/qasymm8_signed.cpp"
-            ],
-            "integer": [
-              "src/cpu/kernels/scale/neon/integer.cpp"
-            ]
+            "fp16": [ "src/cpu/kernels/scale/neon/fp16.cpp" ],
+            "integer": [ "src/cpu/kernels/scale/neon/integer.cpp" ],
+            "qasymm8": [ "src/cpu/kernels/scale/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/scale/neon/qasymm8_signed.cpp" ]
           }
         }
       },
+      "Select": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESelectKernel.cpp",
+            "src/runtime/NEON/functions/NESelect.cpp"
+          ]
+        }
+      },
+      "Slice": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NESlice.cpp" ]
+        }
+      },
       "Softmax": {
         "deps": [
           "Permute"
         ],
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuSoftmax.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuSoftmaxKernel.cpp"
+          "common": [
+            "src/cpu/operators/CpuSoftmax.cpp",
+            "src/cpu/kernels/CpuSoftmaxKernel.cpp",
+            "src/runtime/NEON/functions/NESoftmaxLayer.cpp"
           ],
           "sve": {
-            "all": [
-              "src/cpu/kernels/softmax/impl/sve/impl.cpp"
-            ]
+            "common": [ "src/cpu/kernels/softmax/impl/sve/impl.cpp" ]
           }
         }
       },
+      "SpaceToBatch": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp",
+            "src/runtime/NEON/functions/NESpaceToBatchLayer.cpp"
+          ]
+        }
+      },
+      "SpaceToDepth": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp",
+            "src/runtime/NEON/functions/NESpaceToDepthLayer.cpp"
+          ]
+        }
+      },
+      "Split": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [
+            "src/runtime/NEON/functions/NESplit.cpp"
+          ]
+        }
+      },
+      "Stack": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEStackLayerKernel.cpp",
+            "src/runtime/NEON/functions/NEStackLayer.cpp"
+          ]
+        }
+      },
+      "StridedSlice": {
+        "files": {
+          "common": [
+            "src/core/NEON/kernels/NEStridedSliceKernel.cpp",
+            "src/runtime/NEON/functions/NEStridedSlice.cpp"
+          ]
+        }
+      },
       "Sub": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuSub.cpp"
+          "common": [
+            "src/cpu/operators/CpuSub.cpp",
+            "src/cpu/kernels/CpuSubKernel.cpp",
+            "src/runtime/NEON/functions/NEArithmeticSubtraction.cpp"
           ],
-          "kernel": [
-            "src/cpu/kernels/CpuSubKernel.cpp"
-          ],
+          "sve": {
+            "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ]
+          },
           "neon": {
-            "qsymm16": [
-              "src/cpu/kernels/sub/neon/qsymm16.cpp"
-            ],
-            "qasymm8": [
-              "src/cpu/kernels/sub/neon/qasymm8.cpp"
-            ],
-            "qasymm8_signed": [
-              "src/cpu/kernels/sub/neon/qasymm8_signed.cpp"
-            ]
+            "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ],
+            "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ],
+            "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ]
           }
         }
       },
-      "Transpose": {
-        "files": {
-          "operator": [
-            "src/cpu/operators/CpuTranspose.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuTransposeKernel.cpp"
-          ]
-        }
-      },
       "Tile": {
         "files": {
-          "kernel": [
-            "src/core/NEON/kernels/NETileKernel.cpp"
+          "common": [
+            "src/core/NEON/kernels/NETileKernel.cpp",
+            "src/runtime/NEON/functions/NETile.cpp"
           ]
         }
       },
-      "WinogradConvolution": {
-        "deps": [
-          "Activation",
-          "Permute"
-        ],
+      "Transpose": {
         "files": {
-          "operator": [
-            "src/cpu/operators/CpuWinogradConv2d.cpp"
-          ],
-          "kernel": [
-            "src/cpu/kernels/CpuWinogradConv2dKernel.cpp",
-            "src/core/NEON/kernels/convolution/winograd/padding.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
-            "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp"
+          "common": [
+            "src/cpu/kernels/CpuTransposeKernel.cpp",
+            "src/cpu/operators/CpuTranspose.cpp",
+            "src/runtime/NEON/functions/NETranspose.cpp"
           ]
         }
+      },
+      "Unstack": {
+        "deps": [ "StridedSlice" ],
+        "files": {
+          "common": [ "src/runtime/NEON/functions/NEUnstack.cpp" ]
+        }
       }
     }
   }