Integrate Dynamic Fusion patches

* Add public interfaces:
    * OperatorGraph: Describes a workload that can contain fused kernels
    * IWorkload: Generic interface for workloads built from an OperatorGraph
    * ClWorkload: OpenCL workload built from an OperatorGraph
    * ClCompositeOperator: Runtime async operator to execute a ClWorkload
    * DependencyGraph (will likely be deprecated in later iterations)

* Add example
    * cl_fused_conv2d_elementwise_add.cpp, demonstrating how to use the
      new interfaces (see the rough sketch below)
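    * Rough sketch of the intended flow (illustrative only: setup code
      is omitted and the helper / descriptor names below are
      placeholders; the example above and the new public headers are
      the authoritative reference):

          using namespace arm_compute::experimental::dynamic_fusion;

          // 1. Describe the workload as a graph of fusable operators
          OperatorGraph op_graph;
          auto t_src = add_tensor(op_graph, src_info);
          auto t_wei = add_tensor(op_graph, wei_info);
          auto t_acc = add_tensor(op_graph, acc_info);    // intermediate
          auto t_add = add_tensor(op_graph, addend_info);
          auto t_dst = add_tensor(op_graph, dst_info);
          add_op_conv2d(op_graph, conv_desc, t_src, t_wei, t_acc);
          add_op_elementwise_add(op_graph, add_desc, t_acc, t_add, t_dst);

          // 2. Build an OpenCL workload (ClWorkload) from the graph
          ClWorkload workload;
          build(workload, op_graph, workload_ctx);

          // 3. Run the fused kernels via the runtime async operator
          ClCompositeOperator op;
          op.configure(compile_ctx, workload);
          op.run(tensor_bindings);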

* Add internal translation layer

* Refactor ClKernelBuildingAPI
    * Remove the non-tile-based gemm native kernel component
    * Minor interface changes

* Add integration tests

Resolves COMPMID-5161

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index 3e9ed06..3d49dde 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
@@ -49,69 +51,46 @@
     return *_impl;
 }
 
-Status add_tensor_argument(ClKernelBlueprint &kernel_blueprint, const ClTensorDescriptor &tensor_desc, ArgumentID &id)
+Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info, ArgumentID &id, ArgumentID merge_point)
 {
-    id = kernel_blueprint.impl().add_kernel_argument(tensor_desc);
+    id = kernel_blueprint.impl().add_kernel_tensor(tensor_info, merge_point);
     return Status{};
 }
 
-Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id)
-{
-    id = kernel_blueprint.impl().add_intermediate_tensor();
-    return Status{};
-}
-
-Status add_kcomp_gemm_native(ClKernelBlueprint          &kernel_blueprint, const ClKernelComponentDescriptor &,
-                             const GemmNativeDescriptor &gemm_native_desc,
-                             ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id)
-{
-    kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id });
-    kernel_blueprint.impl().add_component(
-        std::make_unique<ClGemmNativeKernelComponent>(
-            &kernel_blueprint,
-            gemm_native_desc,
-            SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) },
-            SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) },
-            SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
-            SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) }));
-
-    return Status{};
-}
-
-Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &,
+Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClEltwiseAddKernelDescriptor &,
                              ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().add_component(
         std::make_unique<ClElementwiseAddKernelComponent>(
             &kernel_blueprint,
-            SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) },
-            SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) },
-            SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }));
+            SharedVarLink{ src0_id, SharedVarIO::Input },
+            SharedVarLink{ src1_id, SharedVarIO::Input },
+            SharedVarLink{ dst_id, SharedVarIO::Output }));
 
     return Status{};
 }
-Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID, ArgumentID &)
+Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &)
 {
     return Status{};
 }
 
-Status add_kcomp_direct_conv(ClKernelBlueprint                 &kernel_blueprint, const ClKernelComponentDescriptor &,
-                             const DirectConvolutionDescriptor &direct_conv2d_desc,
-                             ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
+Status add_kcomp_direct_conv2d(ClKernelBlueprint                    &kernel_blueprint,
+                               const ClDirectConv2dKernelDescriptor &direct_conv2d_desc,
+                               ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().add_component(
         std::make_unique<ClDirectConvolutionKernelComponent>(
             &kernel_blueprint,
             direct_conv2d_desc,
-            SharedVarLink{ src_id, SharedVarIO::Input, kernel_blueprint.impl().group(src_id) },
-            SharedVarLink{ weight_id, SharedVarIO::Input, kernel_blueprint.impl().group(weight_id) },
-            SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
-            SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) }));
+            SharedVarLink{ src_id, SharedVarIO::Input },
+            SharedVarLink{ weight_id, SharedVarIO::Input },
+            SharedVarLink{ dst_id, SharedVarIO::Output },
+            SharedVarLink{ bias_id, SharedVarIO::Input }));
 
     return Status{};
 }
 
-Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, ArgumentID src_tile, ArgumentID dst_tile, const StoreType &store_type)
+Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const StoreType &store_type, ArgumentID src_tile, ArgumentID dst_tile)
 {
     switch(store_type)
     {
@@ -119,15 +98,15 @@
             kernel_blueprint.impl().add_component(
                 std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>(
                     &kernel_blueprint,
-                    SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
-                    SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
+                    SharedVarLink{ src_tile, SharedVarIO::Input },
+                    SharedVarLink{ dst_tile, SharedVarIO::Output }));
             break;
         case StoreType::TStoreIndirectWidthSelect:
             kernel_blueprint.impl().add_component(
                 std::make_unique<ClStoreIndirectWidthSelectKernelComponent>(
                     &kernel_blueprint,
-                    SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
-                    SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
+                    SharedVarLink{ src_tile, SharedVarIO::Input },
+                    SharedVarLink{ dst_tile, SharedVarIO::Output }));
             break;
         default:
             ARM_COMPUTE_ERROR("Store mode not yet supported.");
@@ -136,6 +115,11 @@
     return Status{};
 }
 
+Status update_merge_point(ClKernelBlueprint &bp, ArgumentID t_id, ArgumentID merge_point)
+{
+    return bp.impl().update_merge_point(t_id, merge_point);
+}
+
 Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info)
 {
     bp.impl().set_tile_info(tile_info);
@@ -143,6 +127,7 @@
 }
 Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &kernel_blueprint)
 {
+    kernel_blueprint.impl().finalize();
     code.name = kernel_blueprint.impl().build_kernel_name();
     code.code = kernel_blueprint.impl().build_code();
 
@@ -153,12 +138,14 @@
 
     return Status{};
 }
+DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint)
+{
+    return blueprint.impl().get_graph();
+}
 Status tune_static(ClExecutionDescriptor &, const ClKernelCode &)
 {
     return Status{};
 }
 } // namespace dynamic_fusion
 } // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 23629f4..3dccdd7 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -21,13 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
 #define ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
 
 namespace arm_compute
 {
@@ -35,46 +40,9 @@
 {
 namespace dynamic_fusion
 {
-using ArgumentID = int32_t;
+using ArgumentID = DependencyGraph::Id;
 
-static constexpr ArgumentID g_arg_placeholder = -1;
-
-/** Verbose and explicit way to enumerate all the tensor arguments variants used by
- *  all kernel implementations. This avoids any ambiguity in what kernel arguments are passed
- */
-enum class TensorArgType : int
-{
-    Scalar,
-
-    Vector,
-
-    Image,
-    Image_Reinterpret_As_3D,
-    Image_Export_To_ClImage2D,
-
-    Image_3D, // 3D Tensor represented as a 2D Image + stride_z
-    Image_3D_Export_To_ClImage2D,
-
-    Tensor_3D,
-    Tensor_4D,
-
-    Tensor_4D_t_Buffer,
-    Tensor_4D_t_Image
-};
-/** Describes all the info required to add a kernel argument at run time */
-struct ClKernelArgRuntimeDescriptor
-{
-    ClKernelArgRuntimeDescriptor(int arg_id, TensorArgType type, bool slide_along_dimz = true)
-        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
-    {
-    }
-    ~ClKernelArgRuntimeDescriptor() = default;
-    int           arg_id{ g_arg_placeholder }; // Arg ID in the blueprint
-    TensorArgType tensor_arg_type{ TensorArgType::Image };
-    bool          slide_along_dimz{ true };
-};
-
-using ClKernelArgList = std::vector<ClKernelArgRuntimeDescriptor>;
+static constexpr ArgumentID g_arg_placeholder = DependencyGraph::empty_id();
 
 /** Intermediate representation of the final, complete kernel source. */
 class ClKernelBlueprint
@@ -93,145 +61,38 @@
 };
 
 ///// Kernel Components /////
-
-/** Meta information about all Cl Kernel Components */
-struct ClKernelComponentDescriptor
-{
-    int32_t version{ 1 }; /**< Operator version */
-};
-
-/** Component: Tensor Argument */
-struct ClTensorDescriptor
-{
-    ClTensorDescriptor(ITensorInfo *info)
-        : tensor_info(info)
-    {
-    }
-
-    ITensorInfo *tensor_info;
-};
-
-Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &);
-Status add_tensor_intermed(ClKernelBlueprint &, ArgumentID &);
-
-/** Component: Gemm Native */
-struct GemmNativeDescriptor
-{
-    float             alpha{};
-    float             beta{};
-    unsigned int      m{};
-    unsigned int      n{};
-    unsigned int      k{};
-    unsigned int      depth_output_gemm3d{};
-    bool              reinterpret_input_as_3d{};
-    bool              broadcast_bias{};
-    bool              fp_mixed_precision{};
-    bool              has_pad_y{};
-    int               nmult_transpose1xW_width{};
-    int               mult_interleave4x4_height{};
-    GEMMLHSMatrixInfo lhs_info{};
-    GEMMRHSMatrixInfo rhs_info{};
-    int32_t           a_offset{};
-    int32_t           b_offset{};
-};
-
-Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
-                             ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id);
-
 /** Component: Eltwise Add */
-struct EltwiseAddDescriptor
-{
-    ConvertPolicy convert_policy{ ConvertPolicy::SATURATE };
-};
-Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &, ArgumentID src0_id,
+Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id,
                              ArgumentID src1_id, ArgumentID &dst_id);
 
 /** Component: Activation */
-struct ActivationDescriptor
-{
-};
-Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
+Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
 
 /** Component: Direct Convolution **/
-struct DirectConvolutionDescriptor
-{
-    PadStrideInfo pad_stride_info{};
-};
-Status add_kcomp_direct_conv(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const DirectConvolutionDescriptor &,
-                             ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
+Status add_kcomp_direct_conv2d(ClKernelBlueprint &, const ClDirectConv2dKernelDescriptor &,
+                               ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
 
-enum class ClippingStrategy
-{
-    TOP_LEFT,
-    TOP_RIGHT,
-    BOTTOM_LEFT,
-    BOTTOM_RIGHT,
-};
+Status add_kcomp_store(ClKernelBlueprint &, const StoreType &store_type, ArgumentID src_id, ArgumentID dst_id);
 
-/** Component: Store */
-struct TileDescriptor
-{
-    Size2D           tile_dims{};
-    Size2D           boundaries{};
-    ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT };
-
-    TileDescriptor()
-    {
-    }
-
-    TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip)
-        : tile_dims(dims), boundaries(bound), clipping(clip)
-    {
-    }
-
-    bool empty() const
-    {
-        return (tile_dims.area() == 0) || (boundaries.area() == 0);
-    }
-};
-
-enum class StoreType
-{
-    VStore,
-    VStorePartial,
-    StoreRow,
-    ConvertStoreRow,
-    StoreBlock,
-    ConvertStoreBlock,
-    StoreRowPartial,
-    StoreBlockPartial,
-    StoreBlockBoundaryAware,
-    StoreVectorSelect,
-    TStoreIndirectWidthSelect
-};
-
-Status add_kcomp_store(ClKernelBlueprint &, const ClKernelComponentDescriptor &, ArgumentID src_id, ArgumentID dst_id, const StoreType &store_type);
+Status add_tensor(ClKernelBlueprint &, ITensorInfo *, ArgumentID &, ArgumentID merge_point = DependencyGraph::empty_id());
 
 ///// Kernel Components /////
 
 ///// Building /////
 
-/** Information required for kernel compilation. The build results of KernelBlueprint */
-struct ClKernelCode
-{
-    std::string     name{};          /**< Kernel name */
-    std::string     code{};          /**< Kernel source code */
-    std::string     config_id{};     /**< Generated from blueprint based on complex component */
-    CLBuildOptions  build_options{}; /**< Kernel build options */
-    Window          window{};        /**< Execution window */
-    ClKernelArgList arguments{};     /**< Kernel argument specficiations */
+/** Update existing merge tensor @p merge_point to point to @p t_id
+ *
+ * @param t_id
+ * @param merge_point
+ * @return Status
+ */
+Status update_merge_point(ClKernelBlueprint &, ArgumentID t_id, ArgumentID merge_point);
 
-    bool operator==(const ClKernelCode &other) const
-    {
-        return name == other.name && code == other.code && build_options == other.build_options;
-    }
-};
-
-/** GPU information for building the @ref ClKernelCode */
-struct GpuInfo
-{
-    GPUTarget target{ GPUTarget::UNKNOWN };
-};
+/** Get dependency graph
+ *
+ * @return DependencyGraph
+ */
+DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint);
 
 /** All information required for building the @ref ClKernelCode */
 struct ClCodeBuilderContext
@@ -247,12 +108,6 @@
 ///// Building /////
 
 ///// Tuning /////
-struct ClExecutionDescriptor
-{
-    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance if not zero */
-    cl::NDRange gws{};                        /**< Global work-group to be used */
-    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
-};
 
 Status tune_static(ClExecutionDescriptor &, const ClKernelCode &);
 
@@ -261,6 +116,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index aa27572..17437c2 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
@@ -36,6 +38,7 @@
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
 
+#include <iostream>
 #include <queue>
 #include <stack>
 #include <string>
@@ -63,8 +66,8 @@
 
 enum class SharedVarGroup
 {
-    Argument, // Parameters to a kernel function
-    Automatic // Automatic variables declared within the kernel body
+    Argument, // Parameters to a kernel function  == dst or src tensors of the whole blueprint graph
+    Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph
 };
 
 /** Specifies a shared variable link for a component.
@@ -74,85 +77,151 @@
  */
 struct SharedVarLink
 {
-    ArgumentID     arg_id{ g_arg_placeholder };
-    SharedVarIO    io{ SharedVarIO::Input };
-    SharedVarGroup group{ SharedVarGroup::Argument };
-    bool           is_empty() const
+    ArgumentID  arg_id{ g_arg_placeholder };
+    SharedVarIO io{ SharedVarIO::Input };
+    bool        is_empty() const
     {
         return arg_id == g_arg_placeholder;
     }
 };
 
 /** A table of all the variables used in the kernel / blueprint
+ * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) **
+ *
  * NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID
  * NOTE: the variables all have the scope of the full kernel function
  */
 class SharedVarTable
 {
 public:
+    /** A fully realized SharedVarLink
+     */
     struct SharedVar
     {
-        SharedVarGroup               group;
-        std::string                  uniq_name; // Unique name, also the final variable name used in the built code
-        ClKernelArgRuntimeDescriptor desc;      // Automatic variables can and should still be described using this struct
+        ArgumentID            arg_id{ g_arg_placeholder };
+        SharedVarIO           io{ SharedVarIO::Input };
+        SharedVarGroup        group{ SharedVarGroup::Argument };
+        std::string           uniq_name{}; // Unique name, also the final variable name used in the built code
+        ClKernelArgDescriptor desc{};      // Automatic variables can and should still be described using this struct
+        bool                  is_empty() const
+        {
+            return arg_id == g_arg_placeholder;
+        }
     };
 
-    using Arguments = std::vector<SharedVar>;
+    class Arguments
+    {
+    public:
+        Arguments() = default;
+        void add_var(const SharedVar &var)
+        {
+            ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument);
+            _vars.push_back(var);
+        }
+        std::vector<SharedVar> get_all_vars() const
+        {
+            return _vars;
+        }
+        std::vector<SharedVar> get_src_vars() const
+        {
+            std::vector<SharedVar> src_vars;
+            std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var)
+            {
+                return var.io == SharedVarIO::Input;
+            });
+            return src_vars;
+        }
+        SharedVar get_dst_var() const
+        {
+            std::vector<SharedVar> dst_vars;
+            std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var)
+            {
+                return var.io == SharedVarIO::Output;
+            });
+            ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1);
+            return dst_vars.at(0);
+        }
 
-    /** @note: The order of insertion is important. There is one precondition:
+    private:
+        std::vector<SharedVar> _vars{};
+    };
+
+    /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, simply return it instead of creating a new one
+     *
+     * @note: The order of insertion is important. There is one precondition:
      *        PRECOND: The components have been sorted topologically / is being traversed in topological order
      *                 This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred
      */
-    SharedVar add(SharedVarLink var_link, ClKernelArgRuntimeDescriptor runtime_desc, const std::string &name = "unnamed")
+    void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed")
     {
         ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected");
+        if(!get(var_link).is_empty())
+        {
+            return;
+        }
+
         auto              var_id = _num_var;
         std::stringstream ss;
         ss << name << "_" << var_id;
         const auto uniq_name = ss.str();
-        SharedVar  var{ var_link.group, uniq_name, runtime_desc };
+        SharedVar  var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc };
 
-        if(var_link.group == SharedVarGroup::Argument)
+        if(group == SharedVarGroup::Argument)
         {
             _arguments.emplace(var_id, var);
+            _arg_id_map.emplace(var_link.arg_id, var_id);
             _num_var++;
-            _var_id_lut[var_link.arg_id] = var_id;
         }
-        else if(var_link.group == SharedVarGroup::Automatic)
+        else if(group == SharedVarGroup::Automatic)
         {
-            if(var_link.io == SharedVarIO::Output)
+            if(_global_vars.empty())
             {
-                _global_vars.emplace(var_id, var);
-                _num_var++;
-                _var_id_lut[var_link.arg_id] = var_id;
+                if(var_link.io == SharedVarIO::Output)
+                {
+                    _global_vars.emplace(var_id, var);
+                    _arg_id_map.emplace(var_link.arg_id, var_id);
+                    _num_var++;
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Component likely not traversed in topological order");
+                }
             }
             else
             {
-                // For the input link, the var (and thus its arg_id) will always have been added by the time we get here if we traverse components in topological order
-                var = get_var(var_link.arg_id);
+                // Associate additional SharedVarLinks with the single global shared variable
+                const auto global_var_id     = _global_vars.begin()->first;
+                _arg_id_map[var_link.arg_id] = global_var_id;
             }
         }
         else
         {
             ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup");
         }
-        return var;
     }
 
-    SharedVar get_var(ArgumentID arg_id) const
+    /** Get the SharedVar associated with @p var_link
+     *
+     * @param var_link
+     * @return SharedVar
+     */
+    SharedVar get(const SharedVarLink &var_link) const
     {
-        const auto var_id = _var_id_lut.at(arg_id); // arg_id has to exist in lut to begin with
-        auto       it     = _global_vars.find(var_id);
-        if(it != _global_vars.end())
+        const SharedVar empty_var{};
+        if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end())
         {
-            return it->second;
+            const auto var_id  = _arg_id_map.at(var_link.arg_id);
+            const auto arg_var = _arguments.find(var_id);
+            if(arg_var != _arguments.end())
+            {
+                return arg_var->second;
+            }
+            else
+            {
+                return _global_vars.at(var_id);
+            }
         }
-        it = _arguments.find(var_id);
-        if(it != _arguments.end())
-        {
-            return it->second;
-        }
-        ARM_COMPUTE_ERROR("Cannot find component variable");
+        return empty_var;
     }
 
     /** @note The arguments are returned in the order they are added
@@ -162,7 +231,7 @@
         Arguments args{};
         for(const auto &a : _arguments)
         {
-            args.push_back(a.second);
+            args.add_var(a.second);
         }
         return args;
     }
@@ -171,9 +240,9 @@
     using VarID = int32_t;
 
 private:
-    std::map<VarID, SharedVar>            _global_vars{};
-    std::map<VarID, SharedVar>            _arguments{};
-    std::unordered_map<ArgumentID, VarID> _var_id_lut{};
+    std::map<VarID, SharedVar>  _global_vars{}; // Shared, global variable
+    std::map<VarID, SharedVar>  _arguments{};
+    std::map<ArgumentID, VarID> _arg_id_map{}; // Track ArgumentIDs that have already been added
     VarID _num_var{ 0 };
 };
 
@@ -184,7 +253,7 @@
     Store
 };
 
-using ComponentID   = int32_t;
+using ComponentID   = DependencyGraph::Id;
 using ComponentList = std::vector<ComponentID>;
 class IClKernelComponent
 {
@@ -224,7 +293,7 @@
     };
     using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
 public:
-    IClKernelComponent(const ClKernelBlueprint *blueprint)
+    IClKernelComponent(ClKernelBlueprint *blueprint)
         : _blueprint(blueprint)
     {
     }
@@ -304,12 +373,18 @@
     {
         return Window{};
     }
-    /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
+    /** Get the tag look-up table used to instantiate the component code.
      *
      * @param vtable
      * @return TagLUT
      */
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const = 0;
+    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0;
+
+    /** Allocate all shared variables used by the component in the @p vtable
+     *
+     * @param vtable
+     */
+    virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0;
 
     virtual std::string get_dst_addr_calculation() const
     {
@@ -331,7 +406,7 @@
     }
 
 protected:
-    const ClKernelBlueprint *_blueprint;
+    ClKernelBlueprint *_blueprint;
 
 private:
     ComponentID _id{};
@@ -348,18 +423,19 @@
     ~Implementation() = default;
 
 public:
-    ArgumentID add_kernel_argument(const ClTensorDescriptor &tensor_desc)
+    Status update_merge_point(ArgumentID t_id, ArgumentID merge_point)
     {
-        _kernel_arguments.insert(std::make_pair(_num_args, tensor_desc));
-        _shared_var_group_lut[_num_args] = SharedVarGroup::Argument;
-        return _num_args++;
+        return _graph.update_merge_point(t_id, merge_point);
     }
 
-    ArgumentID add_intermediate_tensor()
+    ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id())
     {
-        _intermediate_tensors.insert(_num_args);
-        _shared_var_group_lut[_num_args] = SharedVarGroup::Automatic;
-        return _num_args++;
+        const auto id = _graph.add_tensor(merge_point);
+        if(_kernel_tensors.find(id) == _kernel_tensors.end())
+        {
+            _kernel_tensors.insert(std::make_pair(id, tensor_info));
+        }
+        return id;
     }
 
     void set_tile_info(const TileDescriptor &tile_info)
@@ -382,7 +458,7 @@
         for(const auto arg_id : args)
         {
             ARM_COMPUTE_UNUSED(arg_id);
-            ARM_COMPUTE_ERROR_ON_MSG(_kernel_arguments.find(arg_id) == _kernel_arguments.end() && _intermediate_tensors.find(arg_id) == _intermediate_tensors.end() && arg_id != g_arg_placeholder,
+            ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder,
                                      "Trying to use an argument that hasn't been added to the blueprint");
         }
     }
@@ -395,29 +471,36 @@
             ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported.");
         }
 
-        // This flag specifies if the current component is the root of the component graph
-        // If the root is set to -1, it means that a root hasn't been added yet
-        bool is_graph_root = true;
-
         // Get an unique ID for the component that's being added
-        const ComponentID component_id = _num_components++;
+        std::vector<ArgumentID> src_tensors;
+        std::vector<ArgumentID> dst_tensors;
+        for(const auto &link : component->get_links())
+        {
+            if(link.is_empty())
+            {
+                continue;
+            }
+            if(link.io == SharedVarIO::Input)
+            {
+                src_tensors.push_back(link.arg_id);
+            }
+            else
+            {
+                dst_tensors.push_back(link.arg_id);
+            }
+        }
+        const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second;
         component->set_id(component_id);
 
         // Add this component to the component graph. Don't connect it to anything yet
         _component_graph.emplace(component_id, ComponentList{});
 
-        int32_t positional_arg = 0;
-
         // For every { arg_id, arg_io } passed along with this component...
         for(const auto &link : component->get_links())
         {
             const ArgumentID &arg_id = link.arg_id;
             const SharedVarIO &arg_io = link.io;
 
-            // A component is considered root only if all its input arguments are kernel arguments (or placeholders, which means nullptr)
-            // This performs a check on every argument, and if one of them doesn't respect the condition, the component is not considered root
-            is_graph_root &= (_kernel_arguments.find(arg_id) != _kernel_arguments.end()) || (arg_io == SharedVarIO::Output) || (arg_id == g_arg_placeholder);
-
             // Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there
             if(_outgoing_components.find(arg_id) == _outgoing_components.end())
             {
@@ -454,15 +537,9 @@
 
                 _incoming_components[arg_id].push_back(component_id);
             }
-
-            ++positional_arg;
         }
 
-        if(is_graph_root)
-        {
-            ARM_COMPUTE_ERROR_ON_MSG(_graph_root >= 0, "Trying to add more than one root to the graph");
-            _graph_root = component_id;
-        }
+        ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph");
 
         // Finally, add this component to the dictionary of components
         _components.insert(std::make_pair(component_id, std::move(component)));
@@ -489,17 +566,28 @@
         std::set<std::string>    additional_macros{};
         std::vector<std::string> component_codes{}; // vector because order matters
 
-        // Go through the components graph (topological sort) and fill the data structures above
+        // Step 1: Allocate all kernel argument shared variables before generating the component code
         auto stack = topological_sort();
         while(!stack.empty())
         {
             auto  curr_component_id = stack.top();
             auto &curr_component    = _components.find(curr_component_id)->second;
 
+            curr_component->allocate_shared_vars(_vtable);
+
+            stack.pop();
+        }
+        // Step 2: Generate component codes
+        stack = topological_sort();
+        while(!stack.empty())
+        {
+            auto  curr_component_id = stack.top();
+            auto &curr_component    = _components.find(curr_component_id)->second;
+
             auto       curr_headers_list      = curr_component->get_headers_list();
             auto       curr_additional_macros = curr_component->get_additional_macros();
             auto       curr_component_code    = curr_component->get_component_code();
-            const auto var_lut                = curr_component->allocate_vars(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+            const auto var_lut                = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
             component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut));
 
             headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
@@ -511,7 +599,7 @@
             stack.pop();
         }
 
-        // This section assembles the data gathered by traversing the graph into the string "code"
+        // Step 3: Assemble the data gathered by traversing the graph into the string "code"
         std::string code = "";
 
         for(auto &header : headers_list)
@@ -596,34 +684,79 @@
     ClKernelArgList get_arguments() const
     {
         ClKernelArgList arg_list{};
-        for(const auto &arg_var : _vtable.get_kernel_arguments())
+        for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars())
         {
-            arg_list.push_back(arg_var.desc);
+            arg_list[arg_var.desc.arg_id] = arg_var.desc;
         }
         return arg_list;
     }
 
-    const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+    /** Get the arguments as shared vars from the vtable
+     *
+     * @return SharedVarTable::Arguments
+     */
+    SharedVarTable::Arguments get_argument_shared_vars() const
     {
-        auto it = _kernel_arguments.find(id);
-        if(it != _kernel_arguments.end())
+        return _vtable.get_kernel_arguments();
+    }
+
+    const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+    {
+        auto it = _kernel_tensors.find(id);
+        if(it != _kernel_tensors.end())
         {
-            return &_kernel_arguments.find(id)->second;
+            return it->second;
         }
         return nullptr;
     }
 
-    ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+    ITensorInfo *get_kernel_argument_info(const ArgumentID id)
     {
-        const ClTensorDescriptor *arg_desc = get_kernel_argument(id);
-        if(arg_desc != nullptr)
+        auto it = _kernel_tensors.find(id);
+        if(it != _kernel_tensors.end())
         {
-            return arg_desc->tensor_info;
+            return it->second;
         }
         return nullptr;
     }
+    /** Finalize graph construction. Graph is expected to not mutate after being finalized
+     */
+    void finalize()
+    {
+        cache_root_component();
+        assign_shared_var_group();
+    }
+
+    DependencyGraph get_graph() const
+    {
+        return _graph;
+    }
 
 private:
+    void cache_root_component()
+    {
+        const auto roots = _graph.get_root_ops();
+        ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph");
+        _graph_root = roots.at(0);
+    }
+    /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building
+     */
+    void assign_shared_var_group()
+    {
+        for(const auto &tensor : _kernel_tensors)
+        {
+            const auto tensor_id = tensor.first;
+            if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id))
+            {
+                _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument;
+            }
+            else
+            {
+                _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic;
+            }
+        }
+    }
+
     void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
     {
         visited.insert(component_id);
@@ -666,41 +799,41 @@
         std::string code;
         switch(var.desc.tensor_arg_type)
         {
-            case TensorArgType::Vector:
+            case ClKernelTensorArgType::Vector:
             {
                 code += "\n    VECTOR_DECLARATION(" + var.uniq_name + ")";
                 break;
             }
-            case TensorArgType::Image:
+            case ClKernelTensorArgType::Image:
             {
                 code += "\n    IMAGE_DECLARATION(" + var.uniq_name + ")";
                 break;
             }
-            case TensorArgType::Image_3D:
+            case ClKernelTensorArgType::Image_3D:
             {
                 code += "\n    IMAGE_DECLARATION(" + var.uniq_name + "),";
                 code += "\n    uint " + var.uniq_name + "_stride_z";
                 break;
             }
-            case TensorArgType::Image_3D_Export_To_ClImage2D:
+            case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
             {
                 code += "\n    __read_only image2d_t " + var.uniq_name + "_img,";
                 code += "\n    uint " + var.uniq_name + "_stride_z";
                 break;
             }
-            case TensorArgType::Tensor_4D_t_Buffer:
+            case ClKernelTensorArgType::Tensor_4D_t_Buffer:
             {
                 code += "\n    TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
                 break;
             }
-            case TensorArgType::Tensor_4D_t_Image:
+            case ClKernelTensorArgType::Tensor_4D_t_Image:
             {
                 code += "\n    TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
                 break;
             }
             default:
             {
-                ARM_COMPUTE_ERROR("Unsupported declaration generation for TensorArgType");
+                ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType");
             }
         }
         return code;
@@ -710,7 +843,7 @@
     {
         std::string code = "\n__kernel void " + build_kernel_name() + "(";
 
-        for(const auto &arg : argument_list)
+        for(const auto &arg : argument_list.get_all_vars())
         {
             code += generate_argument_declaration(arg) + ",";
         }
@@ -722,54 +855,55 @@
 
     std::string generate_global_section() const
     {
-        std::string code = "";
-        code += "    uint g_x = get_global_id(0);\n";
-        code += "    uint g_y = get_global_id(1);\n";
-        code += "    uint g_z = get_global_id(2);\n\n";
+        auto       dst_info   = get_kernel_argument_info(_dst_id);
+        auto       dst_w      = dst_info->dimension(0);
+        auto       dst_h      = dst_info->dimension(1);
+        const auto tile_w     = std::max(1, get_execution_window().x().step());
+        const auto tile_h     = std::max(1, get_execution_window().y().step());
+        auto       leftover_w = dst_w % tile_w;
+        auto       leftover_h = dst_h % tile_h;
 
-        size_t tile_dim_x = _tile_info.empty() ? 1 : _tile_info.tile_dims.x();
-        size_t tile_dim_y = _tile_info.empty() ? 1 : _tile_info.tile_dims.y();
+        std::string code = "";
+        code += std::string("    int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+        code += std::string("    int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n";
+        code += std::string("    int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n");
 
         switch(_tile_info.clipping)
         {
             case ClippingStrategy::TOP_LEFT:
-                code += "    const bool g_cond_x = (g_x == 0);\n";
-                code += "    const bool g_cond_y = (g_y == 0);\n";
+                code += "    const bool g_cond_x = (cout == 0);\n";
+                code += "    const bool g_cond_y = (mout == 0);\n";
                 break;
             case ClippingStrategy::TOP_RIGHT:
-                code += "    const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
-                code += "    const bool g_cond_y = (g_y == 0);\n";
+                code += "    const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+                code += "    const bool g_cond_y = (mout == 0);\n";
                 break;
             case ClippingStrategy::BOTTOM_LEFT:
-                code += "    const bool g_cond_x = (g_x == 0);\n";
-                code += "    const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+                code += "    const bool g_cond_x = (cout == 0);\n";
+                code += "    const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
                 break;
             case ClippingStrategy::BOTTOM_RIGHT:
-                code += "    const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
-                code += "    const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+                code += "    const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+                code += "    const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
                 break;
             default:
                 ARM_COMPUTE_ERROR("Unsupported clipping strategy");
         }
 
-        code += "\n    REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n";
-        code += "    REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n";
-
         return code;
     }
 
     TileDescriptor _tile_info{};
 
-    int32_t _num_args{};
-    int32_t _num_components{};
     int32_t _num_complex_components{};
 
     ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
 
-    // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
+    DependencyGraph _graph{};
+
+    // Tensors, components and IDs with corresponding ptrs (except intermediate)
     std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
-    std::unordered_map<ArgumentID, ClTensorDescriptor>  _kernel_arguments{};
-    std::unordered_set<ArgumentID> _intermediate_tensors{};
+    std::unordered_map<ArgumentID, ITensorInfo *>       _kernel_tensors{};
     // Argument group lookup. Can be replaced by extending the ArgumentID type to include group info
     std::unordered_map<ArgumentID, SharedVarGroup> _shared_var_group_lut{};
 
@@ -794,6 +928,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
index 41ab4e3..d4feac7 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
@@ -72,6 +74,4 @@
 } // namespace experimental
 } // namespace arm_compute
 
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
index f951ce3..11fb1d5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
 
@@ -31,6 +33,7 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
+#include "arm_compute/runtime/CL/CLScheduler.h"
 namespace arm_compute
 {
 namespace experimental
@@ -44,7 +47,7 @@
 
 std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
 {
-    return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" };
+    return std::set<std::string> { "helpers.h", "tile_helpers.h" };
 }
 
 Window ClDirectConvolutionKernelComponent::get_window() const
@@ -54,7 +57,17 @@
     auto       dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
 
     // Get dst shape
-    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info);
+    PadStrideInfo pad_stride_info
+    {
+        static_cast<unsigned int>(_desc.conv2d.stride.x()),
+        static_cast<unsigned int>(_desc.conv2d.stride.y()),
+        static_cast<unsigned int>(_desc.conv2d.pad.left),
+        static_cast<unsigned int>(_desc.conv2d.pad.right),
+        static_cast<unsigned int>(_desc.conv2d.pad.top),
+        static_cast<unsigned int>(_desc.conv2d.pad.bottom),
+        DimensionRoundingType::FLOOR /*default rounding type*/
+    };
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst_info, output_shape,
@@ -64,6 +77,9 @@
 
     const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
     const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+    // const unsigned int num_rows = 1;
+    // const unsigned int vec_size = tile_info.tile_dims.x();
+    // const unsigned int num_rows = tile_info.tile_dims.y();
 
     // Create and configure kernel window
     Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
@@ -95,27 +111,30 @@
     //------------------ START KERNEL {{meta_kernel_id}} ---------------------
     // IN_0(src)            {{src}}
     // IN_1(wei)            {{weight}}
+    )_";
+    if(bias_info != nullptr)
+    {
+        code += R"_(
     // IN_1(bia)            {{bias}}
+    )_";
+    }
+    code += R"_(
     // OUT(dst, accum)      {{dst}}
 
-    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, M0, 0);          // WIDTH x HEIGHT
-    const int bout = GET_SPATIAL_IDX(2, 1, 0);           // BATCH SIZE IDX
-
     // Initialize the accumulators
     TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
     {
         // All the tensor dimensions are passed at compile time.
         // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-    #define _I{{WEI_WIDTH}} {{WEI_WIDTH}}
-    #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}}
+    #define _IWEI_WIDTH {{WEI_WIDTH}}
+    #define _IWEI_HEIGHT {{WEI_HEIGHT}}
     #define _ISRC_WIDTH {{src}}_w
     #define _ISRC_HEIGHT {{src}}_h
     #define _ISRC_CHANNELS {{src}}_c
-    #define _IDST_WIDTH {{dst_w}}
-    #define _IDST_HEIGHT {{dst_h}}
-    #define _IDST_CHANNELS {{dst_c}}
-    #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}})
+    #define _IDST_WIDTH {{arg_dst}}_w
+    #define _IDST_HEIGHT {{arg_dst}}_h
+    #define _IDST_CHANNELS {{arg_dst}}_c
+    #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
 
         // .v    = access the whole vector (OpenCL vector)
         // .s[x] = access the vector element at position x (scalar access)
@@ -136,13 +155,11 @@
             {{dst}}[i].v = 0;
         })
 
-        uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0);
-
-        for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i)
+        for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
         {
             int ck = 0;
-            int xk = i % _I{{WEI_WIDTH}};
-            int yk = i / _I{{WEI_WIDTH}};
+            int xk = i % _IWEI_WIDTH;
+            int yk = i / _IWEI_WIDTH;
 
             int k = 0;
             for(; k <= (_ISRC_CHANNELS - K0); k += K0)
@@ -201,6 +218,16 @@
     }
 
     code += R"_(
+    #undef _IWEI_WIDTH
+    #undef _IWEI_HEIGHT
+    #undef _ISRC_WIDTH
+    #undef _ISRC_HEIGHT
+    #undef _ISRC_CHANNELS
+    #undef _IDST_WIDTH
+    #undef _IDST_HEIGHT
+    #undef _IDST_CHANNELS
+    #undef _IY_MULTIPLIER
+
         }
     )_";
 
@@ -217,44 +244,7 @@
     }
 
     code += R"_(
-    #undef _I{{WEI_WIDTH}}
-    #undef _I{{WEI_HEIGHT}}
-    #undef _ISRC_WIDTH
-    #undef _ISRC_HEIGHT
-    #undef _ISRC_CHANNELS
-    #undef _IDST_WIDTH
-    #undef _IDST_HEIGHT
-    #undef _IDST_CHANNELS
-    #undef _IY_MULTIPLIER
     }
-
-    // Workaround for the discrepancy between tiles and repeats
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v;
-#if M0 >= 2
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v;
-#endif // M0 >= 2
-#if M0 >= 3
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v;
-#endif // M0 >= 3
-#if M0 >= 4
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v;
-#endif // M0 >= 4
-#if M0 >= 8
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v;
-#endif // M0 >= 8
-#if M0 == 16
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v;
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v;
-#endif // M0 == 16
 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
     )_";
     return code.c_str();
@@ -306,19 +296,18 @@
 CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const
 {
     const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
-    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+    auto       weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
     const auto dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    // const auto tile_info  = _blueprint->impl().get_tile_info();
 
     const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
     const DataType     data_type   = src_info->data_type();
-    const GPUTarget    gpu_target  = ICLKernel().get_target();
+    const GPUTarget    gpu_target  = CLScheduler::get().target();
 
-    Window win = get_window();
-
-    const unsigned int n0                 = win.x().step();
-    const unsigned int m0                 = win.y().step();
+    const unsigned int n0                 = _blueprint->impl().get_execution_window().x().step();
+    const unsigned int m0                 = _blueprint->impl().get_execution_window().y().step();
     const unsigned int k0                 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
-    const unsigned int partial_store_n0   = dst_info->dimension(channel_idx) % n0;
+    const unsigned int partial_store_n0   = dst_info->dimension(0) % n0;
     const bool         export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
 
     // Update the padding for the weights tensor if we can export to cl_image
@@ -338,54 +327,79 @@
     return build_opts;
 }
 
-ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+    const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+    const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+
+    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+
+    const GPUTarget             gpu_target         = CLScheduler::get().target();
+    const bool                  export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
+    const ClKernelTensorArgType weight_type        = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer;
+    vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");
+
+    if(!_bias.is_empty()) // optional bias
+    {
+        vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
+    }
+    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
 {
     TagLUT lut{};
 
     const auto src_info    = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
     const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
     const auto bias_info   = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
-    const auto dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
 
-    const GPUTarget gpu_target         = ICLKernel().get_target();
-    const bool      export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
-
-    const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer;
-    lut["meta_kernel_id"]           = id();
-    lut["src"]                      = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src");
-    lut["weight"]                   = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight");
+    // Arguments and global shared variables
+    lut["src"]    = vtable.get(_src);
+    lut["weight"] = vtable.get(_weight);
 
     if(!_bias.is_empty()) // optional bias
     {
-        lut["bias"]          = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias");
+        lut["bias"]          = vtable.get(_bias);
         lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
     }
-    lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+    lut["dst"] = vtable.get(_dst);
+
+    const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
+    lut["arg_dst"]          = dst_argument.uniq_name;
 
     // Local build options
-    const auto width_idx   = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
-    const auto height_idx  = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
-    const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
-
-    lut["dst_w"] = dst_info->dimension(width_idx);
-    lut["dst_h"] = dst_info->dimension(height_idx);
-    lut["dst_c"] = dst_info->dimension(channel_idx);
-
-    lut["ACC_DATA_TYPE"] = src_info->data_type();
-    lut["SRC_DATA_TYPE"] = src_info->data_type();
-    lut["WEI_DATA_TYPE"] = weight_info->data_type();
+    lut["meta_kernel_id"] = id();
+    lut["ACC_DATA_TYPE"]  = src_info->data_type();
+    lut["SRC_DATA_TYPE"]  = src_info->data_type();
+    lut["WEI_DATA_TYPE"]  = weight_info->data_type();
 
     lut["SRC_TENSOR_TYPE"] = "BUFFER";
-    lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER";
+    switch(vtable.get(_weight).desc.tensor_arg_type)
+    {
+        case ClKernelTensorArgType::Image_Export_To_ClImage2D:
+        case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
+        case ClKernelTensorArgType::Tensor_4D_t_Image:
+        {
+            lut["WEI_TENSOR_TYPE"] = "IMAGE";
+            break;
+        }
+        default:
+        {
+            lut["WEI_TENSOR_TYPE"] = "BUFFER";
+            break;
+        }
+    }
+    const auto width_idx  = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
+    const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
+    lut["WEI_WIDTH"]      = weight_info->dimension(width_idx);
+    lut["WEI_HEIGHT"]     = weight_info->dimension(height_idx);
 
-    lut["WEI_WIDTH"]  = weight_info->dimension(width_idx);
-    lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
+    lut["STRIDE_X"] = _desc.conv2d.stride.x();
+    lut["STRIDE_Y"] = _desc.conv2d.stride.y();
 
-    lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride());
-    lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride());
-
-    lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left();
-    lut["PAD_TOP"]  = _desc.pad_stride_info.pad_top();
+    lut["PAD_LEFT"] = _desc.conv2d.pad.left;
+    lut["PAD_TOP"]  = _desc.conv2d.pad.top;
 
     lut["ZERO_VALUE"] = 0;
 
@@ -393,6 +407,4 @@
 }
 } // namespace dynamic_fusion
 } // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
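The refactor above splits the old allocate_vars() step into two passes: allocate_shared_vars() registers each tensor with the SharedVarTable (together with its group and a ClKernelArgDescriptor), and get_tag_lut() later resolves those registrations into the tags substituted into the component's code template. A minimal sketch of a hypothetical component written against the new interface (the component name is illustrative only; the SharedVarTable, ClKernelArgDescriptor and TagLUT usages are assumed to match those shown in this patch):

    // Pass 1: declare which tensors this component touches and how they are passed
    // to the generated kernel. Nothing is resolved to a variable name yet.
    void ClExampleKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
    {
        vtable.add(_src, _blueprint->impl().group(_src.arg_id),
                   ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
        vtable.add(_dst, _blueprint->impl().group(_dst.arg_id),
                   ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
    }

    // Pass 2: look the variables up (const table, no allocation) and build the
    // tag -> value map used to expand {{...}} placeholders in get_component_code().
    ClExampleKernelComponent::TagLUT ClExampleKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
    {
        TagLUT lut{};
        lut["meta_kernel_id"] = id();
        lut["src"]            = vtable.get(_src);
        lut["dst"]            = vtable.get(_dst);
        return lut;
    }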
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
index 10c0e00..af9a65d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
@@ -39,7 +41,7 @@
 class ClDirectConvolutionKernelComponent : public IClKernelComponent
 {
 public:
-    ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc,
+    ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc,
                                        const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{})
         : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst }
     {
@@ -58,7 +60,8 @@
         return { _src, _weight, _bias, _dst };
     }
 
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+    virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
 
     virtual std::string name() const override
     {
@@ -66,16 +69,14 @@
     }
 
 private:
-    DirectConvolutionDescriptor _desc{};
-    Link                        _src{};
-    Link                        _weight{};
-    Link                        _bias{};
-    Link                        _dst{};
+    ClDirectConv2dKernelDescriptor _desc{};
+    Link                           _src{};
+    Link                           _weight{};
+    Link                           _bias{};
+    Link                           _dst{};
 };
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 84e4003..2bbea87 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
 #include "arm_compute/core/Validate.h"
@@ -41,7 +43,7 @@
 
 std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
 {
-    return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" };
+    return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
 }
 
 Window ClElementwiseAddKernelComponent::get_window() const
@@ -67,63 +69,62 @@
 std::string ClElementwiseAddKernelComponent::get_component_code() const
 {
     std::string code;
-    return R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
-    // IN_0(Accumulator)   {{acc}}
-    // IN_1(Addend)                {{addend}}
+    const bool  is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
 
-    // c = addend + c (mix-precision, broadcast, boundary aware)
+    if(is_root)
     {
-        __global uchar *addend_addr = {{addend}}_ptr + {{addend}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{addend}}_stride_y) + get_global_id(2) * {{addend}}_stride_z; \
-        LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X);                                                                                        \
-        MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp);
+        return R"_(
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    // IN_0(LHS)            {{lhs}}
+    // IN_1(RHS)            {{rhs}}
+    // OUT(dst, accum)      {{dst}}
+
+    // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
+    TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+    {
+        TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
+        TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
+
+        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
+        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
+
+        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
     }
-
-    // Workaround for the discrepancy between tiles and repeats
-#if defined(IS_TILED)
-    {{acc}}[0].v = {{acc}}0;
-#if M0 >= 2
-    {{acc}}[1].v = {{acc}}1;
-#endif // M0 >= 2
-#if M0 >= 3
-    {{acc}}[2].v = {{acc}}2;
-#endif // M0 >= 3
-#if M0 >= 4
-    {{acc}}[3].v = {{acc}}3;
-#endif // M0 >= 4
-#if M0 >= 8
-    {{acc}}[4].v = {{acc}}4;
-    {{acc}}[5].v = {{acc}}5;
-    {{acc}}[6].v = {{acc}}6;
-    {{acc}}[7].v = {{acc}}7;
-#endif // M0 >= 8
-#if M0 == 16
-    {{acc}}[8].v = {{acc}}8;
-    {{acc}}[9].v = {{acc}}9;
-    {{acc}}[10].v = {{acc}}A;
-    {{acc}}[11].v = {{acc}}B;
-    {{acc}}[12].v = {{acc}}C;
-    {{acc}}[13].v = {{acc}}D;
-    {{acc}}[14].v = {{acc}}E;
-    {{acc}}[15].v = {{acc}}F;
-#endif // M0 == 16
-#endif // defined(IS_TILED)
     //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
-
 )_";
+    }
+    else
+    {
+        return R"_(
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    // IN_0/Out(Accumulator)   {{acc}}
+    // IN_1(Addend)        {{addend}}
+
+    // acc = addend + acc (mix-precision, broadcast, boundary aware)
+    {
+        TILE({{DATA_TYPE}}, M0, N0, addend_tile);
+
+        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
+
+        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+    }
+    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+)_";
+    }
 }
 
 CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
 {
-    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-    auto tile_info  = _blueprint->impl().get_tile_info();
+    const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
 
     CLBuildOptions build_opts{};
+    const auto     n0         = _blueprint->impl().get_execution_window().x().step();
+    const auto     m0         = _blueprint->impl().get_execution_window().y().step();
+    const auto     partial_m0 = t_dst_info->dimension(1) % m0;
 
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
 
     return build_opts;
 }
@@ -142,34 +143,56 @@
     return config_id;
 }
 
-ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
 {
-    // Determine which argument is the accumulator
-    Link accumulator;
-    Link addend;
-    if(_lhs.group == SharedVarGroup::Automatic)
+    const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+    vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
+    vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs");
+    if(is_root)
     {
-        accumulator = _lhs;
-        addend      = _rhs;
+        vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
     }
-    else if(_rhs.group == SharedVarGroup::Automatic)
+}
+
+ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+{
+    TagLUT     lut{};
+    const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    // Arguments and global shared variables
+    const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+    if(is_root)
     {
-        accumulator = _rhs;
-        addend      = _lhs;
+        lut["lhs"] = vtable.get(_lhs);
+        lut["rhs"] = vtable.get(_rhs);
+        lut["dst"] = vtable.get(_dst);
     }
     else
     {
-        ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+        // Determine which link is the accumulator
+        Link accumulator;
+        Link addend;
+        if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic)
+        {
+            accumulator = _lhs;
+            addend      = _rhs;
+        }
+        else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic)
+        {
+            accumulator = _rhs;
+            addend      = _lhs;
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+        }
+        lut["acc"]    = vtable.get(accumulator);
+        lut["addend"] = vtable.get(addend);
     }
-    return {
-        { "meta_kernel_id", id() },
-        { "acc", vtable.add(accumulator, ClKernelArgRuntimeDescriptor(accumulator.arg_id, TensorArgType::Image_3D), "add_acc") },
-        { "addend", vtable.add(addend, ClKernelArgRuntimeDescriptor(addend.arg_id, TensorArgType::Image_3D), "add_addend") },
-        // {"dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst")}, // dst is needed for the root version and/or non-inplace version should we need one
-    };
+    // Local build options
+    lut["meta_kernel_id"] = id();
+    lut["DATA_TYPE"]      = get_cl_type_from_data_type(t_dst_info->data_type());
+    return lut;
 }
 } // namespace dynamic_fusion
 } // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
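In the element-wise add component above, whether the component is the root of the fused kernel is inferred from the shared-variable groups of its inputs: if both lhs and rhs are kernel Arguments (i.e. neither is produced by another fused component), the root template is used, declaring a fresh dst tile and loading both operands; otherwise the component accumulates in place, loading only the addend and adding it into the tile produced by the upstream component. A small sketch of that test, assuming group() behaves as used throughout this patch:

    // Root if and only if neither input comes from another component (Automatic group).
    const bool is_root =
        _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument &&
        _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;

    // Root case:     TILE(dst); load lhs and rhs tiles; dst = lhs + rhs
    // Non-root case: load the addend tile only;         acc = acc + addend (in place)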
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index 35c9538..4f7b697 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
@@ -37,7 +39,7 @@
 class ClElementwiseAddKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+    ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
         : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
     {
     }
@@ -54,7 +56,8 @@
         return { _lhs, _rhs, _dst };
     }
 
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+    virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
 
     virtual std::string name() const override
     {
@@ -70,6 +73,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
deleted file mode 100644
index 45b81b4..0000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClGemmNativeKernelComponent::get_component_type() const
-{
-    return ComponentType::Complex;
-}
-
-std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
-{
-    return std::set<std::string> { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
-}
-
-Window ClGemmNativeKernelComponent::get_window() const
-{
-    ITensorInfo *lhs_info  = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
-    ITensorInfo *rhs_info  = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
-    ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
-    ITensorInfo *dst_info  = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
-    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
-
-    bool reinterpret_input_as_3d  = _desc.reinterpret_input_as_3d;
-    bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0;
-
-    Window win{};
-    Window win_out{};
-    bool   window_changed = false;
-
-    // In case both input and dst have to be reinterpreted as 3D tensors,
-    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
-    {
-        reinterpret_output_as_3d = false;
-    }
-
-    // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
-    GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
-                             _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
-                             _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
-
-    // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
-
-    TensorInfo tmp_info(*dst_info);
-
-    if(reinterpret_output_as_3d)
-    {
-        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(dst_info->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    win     = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
-    win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
-
-    AccessWindowStatic src0_access(lhs_info, 0, 0,
-                                   lhs_info->dimension(0),
-                                   lhs_info->dimension(1));
-    AccessWindowStatic src1_access(rhs_info, 0, 0,
-                                   ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
-                                   rhs_info->dimension(1));
-    AccessWindowStatic dst_access(dst_info, 0, 0,
-                                  dst_info->dimension(0),
-                                  dst_info->dimension(1));
-
-    if(bias_info != nullptr)
-    {
-        const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
-
-        AccessWindowStatic src2_access(bias_info, 0, 0,
-                                       ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
-                                       bias_info->dimension(1));
-
-        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
-    }
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window             collapsed             = win;
-    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
-    collapsed                                = win.collapse(win, dimension_to_collapse);
-
-    if(window_changed == true)
-    {
-        ARM_COMPUTE_ERROR("Insufficient Padding!");
-    }
-
-    return collapsed;
-}
-
-std::string ClGemmNativeKernelComponent::get_additional_macros() const
-{
-    return R"_(
-#define VFMA(a, b, c) \
-({                    \
-    c = fma(a, b, c); \
-})
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-    })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-    })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-    })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-    })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-    })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-    })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-    })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
-    ({                                                                \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
-    })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-)_";
-}
-
-std::string ClGemmNativeKernelComponent::get_component_code() const
-{
-    auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
-    auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
-
-    auto has_alpha               = !(helpers::float_ops::is_one(_desc.alpha));
-    auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
-    auto dont_slide_b            = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
-    std::string code = R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ---------------------
-    // IN_0(lhs)            {{lhs}}
-    // IN_1(rhs)            {{rhs}}
-    )_";
-
-    if(!_bias.is_empty())
-    {
-        code += R"_(
-    // IN_2(bias)           {{bias}}
-    )_";
-    }
-
-    code += R"_(
-    // OUT(dst, accum)      {{dst}}
-
-    // Initialize the accumulators
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
-    {
-#if defined(DUMMY_WORK_ITEMS)
-        if((g_x * N0 >= N) || (g_y * M0 >= M))
-        {
-            return;
-        }
-#endif // defined(DUMMY_WORK_ITEMS)
-
-        // Compute LHS matrix address
-        uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y;
-
-        // Compute RHS matrix address
-        uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE);
-    )_";
-
-    if(dont_slide_b)
-    {
-        code += R"_(
-            // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
-            rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z;
-        )_";
-    }
-    else
-    {
-        code += R"_(
-            rhs_offset += g_z * {{rhs}}_stride_z;
-        )_";
-    }
-
-    code += R"_(
-        REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
-    )_";
-
-    if(reinterpret_input_as_3d)
-    {
-        code += R"_(
-            // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
-            CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
-
-            // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-            // multiply lhs_stride_z by DEPTH_GEMM3D
-            lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}};
-        )_";
-    }
-    else
-    {
-        code += R"_(
-            // Add offset for batched GEMM
-            lhs_offset += g_z * {{lhs}}_stride_z;
-        )_";
-    }
-
-    code += R"_(
-        int i = 0;
-#if {{K0}} > 1
-        for(; i <= (K - {{K0}}); i += {{K0}})
-        {
-            // Supported cases (M0, K0):
-            // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
-            // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
-            // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
-            // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
-            // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
-            // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
-            // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
-            // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
-            // Load values from LHS matrix
-            LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
-
-            // Load values from RHS matrix
-            LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
-
-            RHS_VFMA_M0xN0(0, a, b0, {{dst}});
-            RHS_VFMA_M0xN0(1, a, b1, {{dst}});
-#if {{K0}} > 2
-            RHS_VFMA_M0xN0(2, a, b2, {{dst}});
-#endif // K0 > 2
-#if {{K0}} > 3
-            RHS_VFMA_M0xN0(3, a, b3, {{dst}});
-#endif // K0 > 3
-#if {{K0}} > 4
-            RHS_VFMA_M0xN0(4, a, b4, {{dst}});
-            RHS_VFMA_M0xN0(5, a, b5, {{dst}});
-            RHS_VFMA_M0xN0(6, a, b6, {{dst}});
-            RHS_VFMA_M0xN0(7, a, b7, {{dst}});
-#endif // K0 > 4
-#if {{K0}} > 8
-            RHS_VFMA_M0xN0(8, a, b8, {{dst}});
-            RHS_VFMA_M0xN0(9, a, b9, {{dst}});
-            RHS_VFMA_M0xN0(A, a, bA, {{dst}});
-            RHS_VFMA_M0xN0(B, a, bB, {{dst}});
-            RHS_VFMA_M0xN0(C, a, bC, {{dst}});
-            RHS_VFMA_M0xN0(D, a, bD, {{dst}});
-            RHS_VFMA_M0xN0(E, a, bE, {{dst}});
-            RHS_VFMA_M0xN0(F, a, bF, {{dst}});
-#endif // K0 > 8
-
-            lhs_offset += {{K0}} * sizeof(DATA_TYPE);
-            rhs_offset += {{K0}} * {{rhs}}_stride_y;
-        }
-#endif // K0 > 1
-        // Left-over accumulations
-        for(; i < K; ++i)
-        {
-            // Load values from LHS matrix
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0));
-#if M0 > 1
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
-            VEC_DATA_TYPE(DATA_TYPE, 2)
-            a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7));
-#endif // M0 > 7
-
-            VEC_DATA_TYPE(DATA_TYPE, N0)
-            b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y));
-            RHS_VFMA_M0xN0(0, a, b, {{dst}});
-
-            lhs_offset += sizeof(DATA_TYPE);
-            rhs_offset += {{rhs}}_stride_y;
-        }
-
-        // Multiply by the weight of matrix-matrix product and store the result
-    )_";
-    if(has_alpha)
-    {
-        code += R"_(
-            SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}});
-        )_";
-    }
-
-    if(!_bias.is_empty())
-    {
-        if(_desc.broadcast_bias)
-        {
-            code += R"_(
-                // Add beta*bias
-                __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-                LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
-            )_";
-
-            if(helpers::float_ops::is_one(_desc.beta))
-            {
-                code += R"_(
-                    SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}});
-                )_";
-            }
-
-            code += R"_(
-                // c = c + bias[broadcasted]
-                ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
-            )_";
-        }
-        else
-        {
-            code += R"_(
-                // Add beta*bias
-                __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
-                                            PARTIAL_STORE_M0)
-                                            * {{bias}}_stride_y)
-                                            + g_z * {{bias}}_stride_z;
-
-                LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
-            )_";
-
-            if(helpers::float_ops::is_one(_desc.beta))
-            {
-                code += R"_(
-                    SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}});
-                )_";
-            }
-
-            code += R"_(
-                // c = c + bias
-                ADD_BLOCK(M0, {{dst}}, bias);
-            )_";
-        }
-    }
-
-    code += R"_(
-    }
-    //------------------ END KERNEL {{meta_kernel_id}} ---------------------
-    )_";
-    return code.c_str();
-}
-
-CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
-{
-    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-    auto tile_info  = _blueprint->impl().get_tile_info();
-
-    CLBuildOptions build_opts{};
-
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
-    build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y()));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x()));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
-
-    return build_opts;
-}
-
-std::string ClGemmNativeKernelComponent::generate_config_id() const
-{
-    auto        t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-    std::string config_id{};
-    config_id += (_bias.is_empty() ? "add_bias_" : "");
-    config_id += (_desc.broadcast_bias ? "broadcast_bias_" : "");
-    config_id += (_desc.reinterpret_input_as_3d ? "3di_" : "");
-    config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : "");
-    config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(t_dst_info->dimension(1));
-    config_id += "_";
-    config_id += support::cpp11::to_string(t_dst_info->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_desc.k);
-    config_id += "_";
-    config_id += support::cpp11::to_string(t_dst_info->dimension(2));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_desc.lhs_info.m0);
-    config_id += "_";
-    config_id += support::cpp11::to_string(_desc.rhs_info.n0);
-    config_id += "_";
-    config_id += support::cpp11::to_string(_desc.rhs_info.k0);
-    return config_id;
-}
-
-ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
-{
-    TagLUT lut{};
-
-    lut["meta_kernel_id"] = id();
-    lut["lhs"]            = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs");
-    lut["rhs"]            = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs");
-    if(!_bias.is_empty()) // optional bias
-    {
-        lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias");
-    }
-    lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst");
-
-    // Local build options
-    auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
-    auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
-    auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
-    auto has_alpha                = !(helpers::float_ops::is_one(_desc.alpha));
-    auto has_beta                 = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr;
-    auto reinterpret_input_as_3d  = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
-    auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0;
-    auto dont_slide_b             = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
-    lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0);
-
-    if(has_alpha)
-    {
-        lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha);
-    }
-    if(has_beta)
-    {
-        lut["BETA"] = float_to_string_with_full_precision(_desc.beta);
-    }
-    if(dont_slide_b)
-    {
-        lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2));
-    }
-
-    if(reinterpret_output_as_3d)
-    {
-        lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1));
-        lut["DEPTH_GEMM3D"]  = support::cpp11::to_string(t_dst_info->dimension(2));
-    }
-    else if(reinterpret_input_as_3d)
-    {
-        lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1));
-        lut["DEPTH_GEMM3D"]  = support::cpp11::to_string(t_lhs_info->dimension(2));
-    }
-
-    return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
deleted file mode 100644
index b282856..0000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#include "arm_compute/core/Steps.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClGemmNativeKernelComponent : public IClKernelComponent
-{
-public:
-    ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
-                                const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
-        : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
-    {
-    }
-
-    ComponentType         get_component_type() const override;
-    std::set<std::string> get_headers_list() const override;
-    std::string           get_additional_macros() const override;
-    std::string           get_component_code() const override;
-    Window                get_window() const override;
-    ClKernelArgList       get_args();
-    CLBuildOptions        generate_build_options() const override;
-    std::string           generate_config_id() const override;
-
-    virtual std::vector<Link> get_links() const override
-    {
-        return { _lhs, _rhs, _bias, _dst };
-    }
-
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
-    virtual std::string name() const override
-    {
-        return "gemm_mm_native_" + std::to_string(id());
-    }
-
-private:
-    GemmNativeDescriptor _desc{};
-    Link                 _lhs{};
-    Link                 _rhs{};
-    Link                 _bias{};
-    Link                 _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index de02f94..c6716a0 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -21,16 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
 
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 5f023ba..e0b210f 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
 
@@ -65,25 +67,36 @@
 CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
 {
     auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-    auto tile_info  = _blueprint->impl().get_tile_info();
+    // auto tile_info  = _blueprint->impl().get_tile_info();
 
     CLBuildOptions build_opts{};
 
+    const auto n0         = _blueprint->impl().get_execution_window().x().step();
+    const auto m0         = _blueprint->impl().get_execution_window().y().step();
+    const auto partial_m0 = t_dst_info->dimension(0) % m0;
+    const auto partial_n0 = t_dst_info->dimension(1) % n0;
+
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
-    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
-    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0));
 
     return build_opts;
 }
 
-ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src");
+    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst");
+}
+
+ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
 {
     return {
         { "meta_kernel_id", id() },
-        { "src", vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src") },
-        { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") },
+        { "src", vtable.get(_src) },
+        { "dst", vtable.get(_dst) },
     };
 }
 
@@ -96,19 +109,26 @@
 {
     return R"_(
     //------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
-
-    TILE(uint, M0, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, i, 0, 1, M0,
     {
-        dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1);
-        dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}});
-    })
+    #define _IDST_WIDTH {{dst}}_w
+    #define _IDST_HEIGHT {{dst}}_h
+        TILE(uint, M0, 1, dst_indirect_y);
 
-    T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y);
+        // Calculate the destination indirect Y
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+            dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+        })
 
-    //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+        bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+        T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+    #undef _IDST_WIDTH
+    #undef _IDST_HEIGHT
+        //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+    }
 
 )_";
 }
@@ -120,21 +140,24 @@
     return build_opts;
 }
 
-ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
 {
     TagLUT lut{};
 
-    lut["meta_kernel_id"] = id();
-    lut["src"]            = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src");
-    lut["dst"]            = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+    // Arguments and global shared variables
+    lut["src"] = vtable.get(_src);
+    lut["dst"] = vtable.get(_dst);
 
     // Local build options
-    auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
-    lut["dst_w"] = dst_info->dimension(1);
-    lut["dst_h"] = dst_info->dimension(2);
-
+    lut["meta_kernel_id"]  = id();
     lut["DST_TENSOR_TYPE"] = "BUFFER";
+    const auto dst_info    = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
     lut["DST_DATA_TYPE"]   = dst_info->data_type();
 
     return lut;
@@ -142,6 +165,4 @@
 
 } // namespace dynamic_fusion
 } // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
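With the tile info removed, the store components above derive their compile-time block sizes from the blueprint's execution window instead: N0 and M0 are the x and y steps of that window, and the PARTIAL_STORE_* options are the remainders of the destination extents with respect to those steps, which the boundary-aware store uses to avoid out-of-bounds writes. A sketch of the derivation, assuming get_execution_window() as used elsewhere in this patch:

    CLBuildOptions build_opts{};
    const auto win = _blueprint->impl().get_execution_window();
    const auto n0  = win.x().step(); // columns written per work-item
    const auto m0  = win.y().step(); // rows written per work-item
    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
    // PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are then the destination extents modulo m0 / n0.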
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index c7da8bd..26883d7 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -21,7 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
 
 #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
@@ -37,21 +39,21 @@
 class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
 {
 public:
-    ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+    ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
         : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
     ComponentType  get_component_type() const override;
     std::string    get_component_code() const override;
     CLBuildOptions generate_build_options() const override;
+    TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+    void allocate_shared_vars(SharedVarTable &vtable) const override;
 
     virtual std::vector<Link> get_links() const override
     {
         return { _src, _dst };
     }
 
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
     virtual std::string name() const override
     {
         return "";
@@ -65,21 +67,21 @@
 class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent
 {
 public:
-    ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+    ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
         : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
     ComponentType  get_component_type() const override;
     std::string    get_component_code() const override;
     CLBuildOptions generate_build_options() const override;
+    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+    void allocate_shared_vars(SharedVarTable &vtable) const override;
 
     virtual std::vector<Link> get_links() const override
     {
         return { _src, _dst };
     }
 
-    virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
     virtual std::string name() const override
     {
         return "";
@@ -93,6 +95,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
new file mode 100644
index 0000000..5dbf2f6
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/OperatorGraph.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void check_dependency_graph_op_success(OperatorGraph &graph, const Status &status)
+{
+    if(!bool(status))
+    {
+        graph.impl()->status = Status{ status.error_code(), "Cycles or loops are not allowed" };
+    }
+}
+
+// Check if there is more than one root in the graph
+void check_multiple_roots(OperatorGraph &graph)
+{
+    if(graph.impl()->graph.get_root_ops().size() > 1)
+    {
+        graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Multiple roots are not allowed" };
+    }
+}
+
+void check_execution_shape(OperatorGraph &graph, const ITensorInfo &dst_info)
+{
+    const auto roots = graph.impl()->graph.get_root_ops();
+    for(auto root : roots)
+    {
+        // We assume exactly 1 dst tensor for all operators
+        const auto root_info = graph.impl()->tensors[graph.impl()->graph.dst_tensors(root)[0]]->get_tensor_info();
+        for(unsigned int dim = 0; dim < root_info->num_dimensions(); ++dim)
+        {
+            if(root_info->dimension(dim) != dst_info.dimension(dim))
+            {
+                graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot change execution space" };
+                return;
+            }
+        }
+    }
+}
+} // namespace
+
+OpTensor::OpTensor(Id id)
+    : _id{ id }
+{
+}
+
+OpTensor::Id OpTensor::id() const
+{
+    return _id;
+}
+
+bool operator<(const OpTensor &t0, const OpTensor &t1)
+{
+    return t0.id() < t1.id();
+}
+
+Operator::Operator(Id id)
+    : _id{ id }
+{
+}
+
+Operator::Id Operator::id() const
+{
+    return _id;
+}
+
+bool operator<(const Operator &op0, const Operator &op1)
+{
+    return op0.id() < op1.id();
+}
+
+OperatorGraph::OperatorGraph()
+    : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+OperatorGraph::~OperatorGraph() = default;
+
+OperatorGraph::Implementation *OperatorGraph::impl()
+{
+    return _impl.get();
+}
+
+const OperatorGraph::Implementation *OperatorGraph::impl() const
+{
+    return _impl.get();
+}
+
+Status validate(const OperatorGraph &graph)
+{
+    return graph.impl()->status;
+}
+
+OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info)
+{
+    auto     id = graph.impl()->graph.add_tensor();
+    OpTensor op_tensor(id);
+    graph.impl()->add_tensor(id, &info);
+    return op_tensor;
+}
+
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst)
+{
+    // A complex operator can only be the root, so reject if the graph already has a root operator
+    if(!graph.impl()->graph.get_root_ops().empty())
+    {
+        graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot add multiple complex operators" };
+        return Operator{};
+    }
+
+    std::pair<Status, DependencyGraph::Id> status_id;
+
+    if(bias.id() == -1)
+    {
+        status_id = graph.impl()->graph.add_operator({ input.id(), weights.id() }, { dst.id() });
+    }
+    else
+    {
+        status_id = graph.impl()->graph.add_operator({ input.id(), weights.id(), bias.id() }, { dst.id() });
+    }
+
+    check_dependency_graph_op_success(graph, status_id.first);
+
+    Operator op_node(status_id.second);
+
+    // Infer TensorInfo
+    OpTensorContent *dst_tensor = graph.impl()->tensors[dst.id()].get();
+    if(dst_tensor->get_tensor_info()->total_size() == 0)
+    {
+        auto src   = graph.impl()->tensors[input.id()]->get_tensor_info();
+        auto wts   = graph.impl()->tensors[weights.id()]->get_tensor_info();
+        auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wts->tensor_shape(), PadStrideInfo(desc.stride.x(), desc.stride.y(), desc.pad.left,
+                                                                            desc.pad.right,
+                                                                            desc.pad.top, desc.pad.bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+        auto_init_if_empty(*(dst_tensor->get_tensor_info()), src->clone()->set_tensor_shape(shape));
+    }
+
+    // Check execution space
+    auto dst_info = dst_tensor->get_tensor_info();
+    check_execution_shape(graph, *dst_info);
+
+    ITensorDescPack<OpTensorContent> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[input.id()].get());
+    tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[weights.id()].get());
+    if(bias.id() != -1)
+    {
+        tensors.add_const_tensor(ACL_SRC_2, graph.impl()->tensors[bias.id()].get());
+    }
+    tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+
+    graph.impl()->add_node<Conv2dContent>(status_id.second, desc, tensors);
+    check_multiple_roots(graph);
+
+    return op_node;
+}
+
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst)
+{
+    return add_op_conv2d(graph, desc, input, weights, OpTensor(-1), dst);
+}
+
+void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method)
+{
+    auto node = utils::cast::polymorphic_downcast<Conv2dContent *>(graph.impl()->operators[conv2d.id()].get());
+    node->set_method(method);
+}
+
+Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
+{
+    auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() });
+    check_dependency_graph_op_success(graph, id.first);
+
+    Operator op_node(id.second);
+
+    // Infer TensorInfo
+    auto             node_lhs = graph.impl()->tensors[lhs.id()]->get_tensor_info();
+    auto             node_rhs = graph.impl()->tensors[rhs.id()]->get_tensor_info();
+    OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
+
+    if(node_dst->get_tensor_info()->total_size() == 0)
+    {
+        const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*node_rhs, *node_lhs);
+        auto_init_if_empty(*(node_dst->get_tensor_info()), node_lhs->clone()->set_tensor_shape(broadcast_pair.first));
+    }
+
+    // Check execution space
+    auto dst_info = node_dst->get_tensor_info();
+    check_execution_shape(graph, *dst_info);
+
+    ITensorDescPack<OpTensorContent> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get());
+    tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get());
+    tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+    graph.impl()->add_node<AddContent>(id.second, desc, tensors);
+    check_multiple_roots(graph);
+
+    return op_node;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
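
Illustrative usage sketch (not part of the patch) of the OperatorGraph API added above, assuming the public header arm_compute/core/experimental/OperatorGraph.h and the TensorInfo constructor that takes a DataLayout; all shapes are hypothetical, descriptor fields are left at their defaults for brevity, and only the dst infos are left empty so the add_op_* helpers can infer them:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/experimental/OperatorGraph.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Status build_conv2d_add_graph(OperatorGraph &graph)
    {
        // Hypothetical NHWC F32 shapes (TensorShape dim order: C, W, H, N)
        TensorInfo input_info(TensorShape(32U, 28U, 28U, 1U), 1, DataType::F32, DataLayout::NHWC);
        TensorInfo weights_info(TensorShape(32U, 3U, 3U, 16U), 1, DataType::F32, DataLayout::NHWC);
        TensorInfo addend_info(TensorShape(16U, 1U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC);
        TensorInfo conv_dst_info{}; // empty: inferred by add_op_conv2d
        TensorInfo dst_info{};      // empty: inferred by add_op_elementwise_add

        OpTensor input    = add_tensor(graph, input_info);
        OpTensor weights  = add_tensor(graph, weights_info);
        OpTensor conv_dst = add_tensor(graph, conv_dst_info);
        OpTensor addend   = add_tensor(graph, addend_info);
        OpTensor dst      = add_tensor(graph, dst_info);

        // Conv2d is the complex operator and therefore the single root of the graph
        add_op_conv2d(graph, Conv2dDescriptor{}, input, weights, conv_dst);
        // The elementwise add consumes the conv2d result and gets fused with it
        add_op_elementwise_add(graph, AddDescriptor{}, conv_dst, addend, dst);

        // Any error recorded while building (cycles, multiple roots, shape mismatch) surfaces here
        return validate(graph);
    }
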
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
new file mode 100644
index 0000000..7e9f6b8
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> get_combinations(const std::vector<ClKernelFusionGroup *> &sorted_fgs)
+{
+    ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1);
+    std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> combo;
+    for(size_t i = 0; i < sorted_fgs.size() - 1; ++i)
+    {
+        for(size_t j = i + 1; j < sorted_fgs.size(); ++j)
+        {
+            combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j)));
+        }
+    }
+    return combo;
+}
+} // namespace
+std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group)
+{
+    std::vector<const ClKernel *> kernels;
+    const auto                    sorted = group.graph.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        kernels.push_back(group.fused_kernels.at(pack.op));
+    }
+    return kernels;
+}
+
+std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph)
+{
+    std::vector<const ClKernelFusionGroup *> kernels;
+    const auto                               sorted = graph.fg_dependency.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        kernels.push_back(graph.fusion_groups.at(pack.op).get());
+    }
+    return kernels;
+}
+
+std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph)
+{
+    std::vector<ClKernelFusionGroup *> kernels;
+    const auto                         sorted = graph.fg_dependency.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        kernels.push_back(graph.fusion_groups.at(pack.op).get());
+    }
+    return kernels;
+}
+
+std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph)
+{
+    ClFusedKernelGraph fused_kernel_graph{};
+    fused_kernel_graph.original_graph = &kernel_graph; // Keep a pointer to the original kernel graph
+    fused_kernel_graph.fg_dependency  = DependencyGraph();
+    // Initialize all fusion groups
+    for(const auto &kernel : traverse(kernel_graph))
+    {
+        fused_kernel_graph.add_fusion_group({ kernel });
+    }
+    return { Status{}, fused_kernel_graph };
+}
+
+Status fuse(ClFusedKernelGraph &fused_kernel_graph)
+{
+    // A naive fusion algorithm that's guaranteed to find the optimal pattern if there are no branches
+    // If there are branches, the algorithm cannot guarantee optimality as it doesn't perform any searches
+
+    bool fusion_found = false;
+    do
+    {
+        fusion_found          = false;
+        const auto sorted_fgs = traverse(fused_kernel_graph);
+        if(sorted_fgs.size() <= 1)
+        {
+            // Only one or zero fusion group, thus no need to perform fusion
+            return Status{};
+        }
+        auto fgs_combo = get_combinations(sorted_fgs);
+        for(auto fgs : fgs_combo)
+        {
+            auto       fg0 = fgs.first;
+            auto       fg1 = fgs.second;
+            const auto st  = fused_kernel_graph.can_fuse(*fg0, *fg1);
+            if(bool(st))
+            {
+                const auto st = fused_kernel_graph.fuse(*fg0, *fg1);
+                if(!bool(st))
+                {
+                    return st;
+                }
+                fusion_found = true;
+                break;
+            }
+        }
+    }
+    while(fusion_found);
+    return Status{};
+}
+Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg)
+{
+    Status st{};
+    for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id))
+    {
+        const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
+
+        /// NOTE: dst tensor must have already been added to the blueprint at this point
+        ArgumentID dst_id;
+        st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id);
+        if(!bool(st))
+        {
+            return st;
+        }
+        /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors. But this is irrelevant to the fused kernel graph
+        /// since both tensors share the exact same info and kernel arg descriptor
+        ArgumentID dst_dst_id;
+        st = add_tensor(bp, dst_t->desc, dst_dst_id);
+        if(!bool(st))
+        {
+            return st;
+        }
+        /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead.
+        /// This is required because the blueprint's get_arguments() returns the dst tensor added by the store component
+        st = update_merge_point(bp, dst_dst_id, dst_t->id);
+        if(!bool(st))
+        {
+            return st;
+        }
+        st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id);
+        if(!bool(st))
+        {
+            return st;
+        }
+    }
+    return st;
+}
+
+Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph)
+{
+    workload.context = ctx;
+    for(const auto &fg : traverse(fused_kernel_graph))
+    {
+        ClKernelBlueprint bp{};
+        for(const auto &kernel : traverse(*fg))
+        {
+            const auto st = kernel->generate(bp);
+            if(!bool(st))
+            {
+                return st;
+            }
+        }
+        auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc);
+        if(!bool(st))
+        {
+            return st;
+        }
+        st = generate_store(bp, fused_kernel_graph, *fg);
+        if(!bool(st))
+        {
+            return st;
+        }
+
+        ClKernelCode code{};
+        st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp);
+        if(!bool(st))
+        {
+            return st;
+        }
+        const auto bp_graph = get_dependency_graph(bp);
+
+        // Get tensor info
+        std::vector<Id> workload_src_tensors{};
+        for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id))
+        {
+            const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id);
+            // Get corresponding kernel arg descriptor
+            const auto arg_desc    = code.arguments.at(bp_graph.get_merge_points().at(src_t->id));
+            const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id);
+            workload_src_tensors.push_back(kernel_t_id);
+        }
+        std::vector<Id> workload_dst_tensors{};
+        for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id))
+        {
+            const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
+            // Get corresponding kernel arg descriptor
+            const auto arg_desc    = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id));
+            const auto kernel_t_id = workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id);
+            workload_dst_tensors.push_back(kernel_t_id);
+        }
+
+        workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors);
+    }
+
+    return Status{};
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
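
Illustrative sketch (not part of the patch) of how the three free functions defined above are chained by the translation layer; workload, context and kernel_graph are assumed to be provided by the caller:

    #include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Status translate_sketch(ClWorkload &workload, const ClWorkloadContext &context, const ClKernelGraph &kernel_graph)
    {
        // 1. Wrap every kernel of the kernel graph in its own single-kernel fusion group
        auto res = init_fusion_graph(kernel_graph);
        if(!bool(res.first))
        {
            return res.first;
        }
        // 2. Greedily merge directly connected fusion groups for as long as can_fuse() allows it
        const auto st = fuse(res.second);
        if(!bool(st))
        {
            return st;
        }
        // 3. Emit one unit workload per fusion group: blueprint, store components, built kernel code
        return generate(workload, context, res.second);
    }
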
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
new file mode 100644
index 0000000..4bd3cd9
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "support/DeepCopy.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClKernelFusionGroup;
+
+/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
+ *
+ */
+struct ClKernelFusionGroup
+{
+public:
+    using Id = DependencyGraph::Id;
+
+    ClKernelFusionGroup() = default;
+    ClKernelFusionGroup(Id id)
+        : id{ id }, graph{}, fused_kernels{}, tensors{}
+    {
+    }
+    ~ClKernelFusionGroup() = default;
+
+    void set_id(Id i)
+    {
+        id = i;
+    }
+
+    Id add_fused_kernel(const ClKernel *kernel)
+    {
+        /// PRE: Acyclicity ensured by DependencyGraph
+        /// PRE: Connectedness ensured by DependencyGraph
+        /// PRE: Single-rootedness ensured by User
+        std::vector<Id> src_tensors;
+        for(const auto t : kernel->tensors().get_const_src_tensors())
+        {
+            auto id = graph.add_tensor(t->id);
+            if(tensors.find(id) == tensors.end())
+            {
+                tensors[id] = t;
+            }
+            src_tensors.push_back(id);
+        }
+        std::vector<Id> dst_tensors;
+        for(const auto t : kernel->tensors().get_const_dst_tensors())
+        {
+            auto id = graph.add_tensor(t->id);
+            if(tensors.find(id) == tensors.end())
+            {
+                tensors[id] = t;
+            }
+            dst_tensors.push_back(id);
+        }
+        auto id                  = graph.add_operator(src_tensors, dst_tensors);
+        fused_kernels[id.second] = kernel;
+        return id.second;
+    }
+
+    const ClKernel *get_root_kernel() const
+    {
+        auto root_kernels = graph.get_root_ops();
+        ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
+        return fused_kernels.at(root_kernels.at(0));
+    }
+
+    std::vector<const ClKernelTensor *> get_src_tensors() const
+    {
+        std::vector<const ClKernelTensor *> src_tensors;
+        for(auto tensor_id : graph.src_tensors())
+        {
+            src_tensors.push_back(tensors.at(tensor_id));
+        }
+        return src_tensors;
+    }
+
+    std::vector<const ClKernelTensor *> get_dst_tensors() const
+    {
+        std::vector<const ClKernelTensor *> dst_tensors;
+        for(auto tensor_id : graph.dst_tensors())
+        {
+            dst_tensors.push_back(tensors.at(tensor_id));
+        }
+        return dst_tensors;
+    }
+
+    friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
+    {
+        return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
+    }
+
+    Id              id{};
+    DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
+    std::map<Id, const ClKernel *>       fused_kernels{};
+    std::map<Id, const ClKernelTensor *> tensors{};
+};
+
+std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
+
+struct ClFusedKernelGraph
+{
+public:
+    using Id = DependencyGraph::Id;
+
+    using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
+
+    ClFusedKernelGraph()                                = default;
+    ~ClFusedKernelGraph()                               = default;
+    ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
+    ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
+    ClFusedKernelGraph(ClFusedKernelGraph &&graph)                 = default;
+    ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
+
+    friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
+    {
+        /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
+        ///       The fusion groups can already guarantee the equivalence of fusion
+        ///       In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
+        return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
+    }
+
+    Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
+    {
+        auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
+        for(const auto k : fused_kernels)
+        {
+            fg->add_fused_kernel(k);
+        }
+        const auto      src_tensors = fg->get_src_tensors();
+        const auto      dst_tensors = fg->get_dst_tensors();
+        std::vector<Id> inputs{};
+        std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
+        {
+            return fg_dependency.add_tensor(kernel->id);
+        });
+        std::vector<Id> outputs{};
+        std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
+        {
+            return fg_dependency.add_tensor(kernel->id);
+        });
+        const auto id = fg_dependency.add_operator(inputs, outputs);
+        fg->set_id(id.second);
+        fusion_groups[id.second] = std::move(fg);
+        return id.second;
+    }
+
+    Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
+    {
+        /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
+        ClKernelFusionGroup *fg_src{};
+        ClKernelFusionGroup *fg_dst{};
+        // Find fg_src (parent / root) and fg_dst (child / non-root)
+        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+        {
+            fg_src = &fg0;
+            fg_dst = &fg1;
+        }
+        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+        {
+            fg_src = &fg1;
+            fg_dst = &fg0;
+        }
+        else
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+        }
+
+        for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
+        {
+            if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
+            {
+                // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
+
+                // Before:
+                // fg_src
+                // |
+                // ..          t1
+                // |           |
+                // -> fg_dst <-
+                //
+                // After:
+                // fg_src <---t1
+                //
+                const auto st = link_src_tensors(fg_src->id, { t });
+                if(!bool(st))
+                {
+                    return st;
+                }
+            }
+            else
+            {
+                const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
+                if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
+                {
+                    // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
+                    // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
+
+                    // Before:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // -> fg_dst
+                    //
+                    // After:
+                    // fg_src
+                    //
+                    const auto st = remove_fg_tensor(t);
+                    if(!bool(st))
+                    {
+                        return st;
+                    }
+                }
+                else
+                {
+                    // If the tensors ARE in between fg_src and fg_dst
+                    // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
+
+                    // Before:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // |-----------
+                    // |          |
+                    // -> fg_dst  -> fg_other
+                    //
+                    // After:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // -> fg_other
+                    //
+
+                    // This may look like a case we should not fuse, but it only means that t0 is an
+                    // intermediate tensor between the fused fg_src and fg_dst that is also STOREd to
+                    // memory so that any unfused fusion groups (fg_other in this case) can read it.
+                    // In other words, a fusion group can STORE not only the tensors at its "end", but
+                    // any tensor that is not a source tensor. Every tensor that is STOREd (exported)
+                    // counts as a "dst tensor" of the fusion group
+                    void();
+                }
+            }
+        }
+
+        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+        {
+            // Link any outgoing tensors of fg_dst to fg_src
+
+            // Before:
+            // fg_src
+            // |
+            // ..
+            // |
+            // -> fg_dst
+            //    |
+            //    |--------
+            //    |       |
+            //    |-> t0  |-> t1
+            //
+            // After:
+            // fg_src
+            // |
+            // |--------
+            // |       |
+            // |-> t0  |-> t1
+            //
+            const auto st = link_dst_tensors(fg_src->id, { t });
+            if(!bool(st))
+            {
+                return st;
+            }
+        }
+
+        // Merge fg_dst's graph into fg_src's graph
+        for(const auto kernel : traverse(*fg_dst))
+        {
+            fg_src->add_fused_kernel(kernel);
+        }
+
+        const auto st = remove_fg(fg_dst->id);
+        return st;
+    }
+    Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
+    {
+        /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
+        /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
+        ///              Note that this does not apply to fusion groups
+        /// ASSUMPTION2: Simple kernels' tile infos can be overridden by (shared with) the root kernel's
+        /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
+        /// INV0: All Fusion groups have a single root
+        /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
+        /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
+        /// INV3: All non-roots are Simple kernels
+        /// INV4: All non-roots' dst tensors have the same shape as that of the root kernel
+        /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
+        const ClKernelFusionGroup *fg_src{};
+        const ClKernelFusionGroup *fg_dst{};
+
+        // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
+        // This guarantees INV0
+        // This also finds fg_src (parent / root) and fg_dst (child / non-root)
+        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+        {
+            fg_src = &fg0;
+            fg_dst = &fg1;
+        }
+        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+        {
+            fg_src = &fg1;
+            fg_dst = &fg0;
+        }
+        else
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+        }
+
+        // Find unconnected tensors between fg_src and fg_dst
+        std::vector<Id> unconnected_tensors{};
+        for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
+        {
+            if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
+            {
+                unconnected_tensors.push_back(t);
+            }
+        }
+
+        // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
+        // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
+        for(const auto &t : unconnected_tensors)
+        {
+            if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
+            {
+                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
+            }
+        }
+
+        // Check 2: All non-root fgs are simple. Ensure INV3
+        if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
+        }
+
+        // Check 3: All non-roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
+        const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
+        ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
+        const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
+
+        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+        {
+            const auto t_info = original_graph->get_tensor(t)->desc;
+            if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
+            {
+                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
+            }
+        }
+
+        // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
+        if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
+        }
+
+        return Status{};
+    }
+
+    const ClKernelGraph *original_graph{};
+    DependencyGraph      fg_dependency{};
+    KernelFusionGroupMap fusion_groups{};
+    // Note: no need to store tensor pointers in the ClFusedKernelGraph, as they are stored inside the individual fusion groups.
+
+private:
+    Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
+    {
+        for(auto t : src_tensors)
+        {
+            fg_dependency.link_input(fg, t);
+        }
+        return Status{};
+    }
+    Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
+    {
+        for(auto t : dst_tensors)
+        {
+            fg_dependency.link_output(fg, t);
+        }
+        return Status{};
+    }
+    Status remove_fg(Id fg)
+    {
+        fg_dependency.remove_operator(fg);
+        fusion_groups.erase(fg);
+        return Status{};
+    }
+    Status remove_fg_tensor(Id tensor)
+    {
+        fg_dependency.remove_tensor(tensor);
+        return Status{};
+    }
+};
+
+std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
+std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
+
+std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
+
+Status fuse(ClFusedKernelGraph &fused_kernel_graph);
+
+Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
+
+Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
\ No newline at end of file
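
Illustrative sketch (not part of the patch) of the DependencyGraph operations the fusion logic above relies on, shown on a toy two-operator chain t0 -> op0 -> t1 -> op1 -> t2; the calls used are exactly those appearing above (add_operator returns a {Status, Id} pair):

    #include "arm_compute/core/experimental/DependencyGraph.h"

    using arm_compute::experimental::dynamic_fusion::DependencyGraph;

    void dependency_graph_sketch()
    {
        DependencyGraph g{};
        const auto t0  = g.add_tensor();
        const auto t1  = g.add_tensor();
        const auto t2  = g.add_tensor();
        const auto op0 = g.add_operator({ t0 }, { t1 }).second; // root: its src tensor is a graph input
        const auto op1 = g.add_operator({ t1 }, { t2 }).second; // consumes op0's dst tensor

        // Queries used by ClFusedKernelGraph when deciding what can be fused
        const auto roots    = g.get_root_ops();     // expected: { op0 }
        const auto children = g.dst_ops(op0);       // expected: { op1 }
        const auto sorted   = g.topological_sort(); // {Status, op packs in topological order}
        (void)op1;
        (void)roots;
        (void)children;
        (void)sorted;
    }
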
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
new file mode 100644
index 0000000..cdd2b2e
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
+
+#include "arm_compute/core/experimental/OperatorGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClDirectConv2dKernelDescriptor
+{
+    friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1)
+    {
+        return desc0.conv2d == desc1.conv2d;
+    }
+    Conv2dDescriptor conv2d{};
+};
+
+struct ClEltwiseAddKernelDescriptor
+{
+    friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1)
+    {
+        return desc0.add == desc1.add;
+    }
+    AddDescriptor add{};
+};
+struct ClActivationKernelDescriptor
+{
+    friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &)
+    {
+        return true;
+    }
+};
+
+enum class ClippingStrategy
+{
+    TOP_LEFT,
+    TOP_RIGHT,
+    BOTTOM_LEFT,
+    BOTTOM_RIGHT,
+};
+/** Component: Store */
+struct TileDescriptor
+{
+    Size2D           tile_dims{};
+    Size2D           boundaries{};
+    ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT };
+
+    TileDescriptor()
+    {
+    }
+
+    TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip)
+        : tile_dims(dims), boundaries(bound), clipping(clip)
+    {
+    }
+
+    bool empty() const
+    {
+        return (tile_dims.area() == 0) || (boundaries.area() == 0);
+    }
+    friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1)
+    {
+        return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping;
+    }
+};
+enum class StoreType
+{
+    VStore,
+    VStorePartial,
+    StoreRow,
+    ConvertStoreRow,
+    StoreBlock,
+    ConvertStoreBlock,
+    StoreRowPartial,
+    StoreBlockPartial,
+    StoreBlockBoundaryAware,
+    StoreVectorSelect,
+    TStoreIndirectWidthSelect
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
\ No newline at end of file
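
Illustrative sketch (not part of the patch) of the store-related descriptors declared above; the values are hypothetical, whereas in the translation layer they come from the root kernel's ClKernelConfig:

    #include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void store_descriptor_sketch()
    {
        // 4x4 tiles over a 27x25 destination, clipped from the top-left corner
        const TileDescriptor tile_desc(Size2D(4, 4), Size2D(27, 25), ClippingStrategy::TOP_LEFT);
        const StoreType      store_type = StoreType::TStoreIndirectWidthSelect;

        // empty() is true only when either the tile or the boundary area is zero
        const bool valid = !tile_desc.empty();
        (void)store_type;
        (void)valid;
    }
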
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
new file mode 100644
index 0000000..8aaf094
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const
+{
+    const auto input  = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto bias   = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    const auto dst    = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
+    ArgumentID input_id;
+    add_tensor(bp, input->desc, input_id, input->id);
+    ArgumentID weight_id;
+    add_tensor(bp, weight->desc, weight_id, weight->id);
+    ArgumentID bias_id = g_arg_placeholder;
+    if(bias != nullptr)
+    {
+        add_tensor(bp, bias->desc, bias_id, bias->id);
+    }
+    ArgumentID dst_id;
+    add_tensor(bp, dst->desc, dst_id, dst->id);
+
+    add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id);
+    return Status{};
+}
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc)
+{
+    // 1. Check validity
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    // Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+    }
+
+    // Matching data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases);
+    }
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0);
+    }
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    // weights shape is correct
+    const DataLayout data_layout = src->data_layout();
+    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+    // dst shape is correct
+    PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top,
+                                    conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{});
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+                                                       misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride));
+
+    // biases shape is correct
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
+                                        "Biases should be one dimensional");
+    }
+
+    // 2. Check support level
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+    return Status{};
+}
+
+bool ClDirectConv2dKernel::operator==(const ClKernel &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const ClDirectConv2dKernel *>(&other);
+    return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+Status ClAddKernel::generate(ClKernelBlueprint &bp) const
+{
+    const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ArgumentID lhs_id;
+    add_tensor(bp, lhs->desc, lhs_id, lhs->id);
+    ArgumentID rhs_id;
+    add_tensor(bp, rhs->desc, rhs_id, rhs->id);
+    ArgumentID dst_id;
+    add_tensor(bp, dst->desc, dst_id, dst->id);
+
+    add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id);
+    return Status{};
+}
+
+Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
+{
+    // 1. Check validity
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+    // Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+    // Matching data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst);
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+    const bool in_place      = (lhs == dst) || (rhs == dst);
+    const bool src0_in_place = in_place && (lhs == dst);
+
+    // dst shape is correct
+    const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+    if(in_place)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0),
+                                        "Wrong shape for dst, cannot do in_place calculation");
+    }
+
+    // 2. Check support level
+
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
+
+    return Status{};
+}
+
+bool ClAddKernel::operator==(const ClKernel &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const ClAddKernel *>(&other);
+    return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+std::vector<const ClKernel *> traverse(const ClKernelGraph &graph)
+{
+    std::vector<const ClKernel *> kernels;
+    const auto                    sorted = graph.graph.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        kernels.push_back(graph.kernels.at(pack.op).get());
+    }
+    return kernels;
+}
+std::vector<ClKernel *> traverse(ClKernelGraph &graph)
+{
+    std::vector<ClKernel *> kernels;
+    const auto              sorted = graph.graph.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        kernels.push_back(graph.kernels.at(pack.op).get());
+    }
+    return kernels;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
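
Illustrative sketch (not part of the patch) of the standalone validate() entry point defined above for the add kernel; shapes are hypothetical NHWC F32 tensors where the right-hand side broadcasts over width and height:

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Status validate_add_sketch()
    {
        // TensorShape dim order for NHWC: C, W, H, N
        TensorInfo lhs(TensorShape(16U, 26U, 26U, 1U), 1, DataType::F32, DataLayout::NHWC);
        TensorInfo rhs(TensorShape(16U, 1U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC);
        TensorInfo dst(TensorShape(16U, 26U, 26U, 1U), 1, DataType::F32, DataLayout::NHWC);

        // Checks data types, layouts, broadcast compatibility and the supported type/layout set
        return ClAddKernel::validate(&lhs, &rhs, &dst);
    }
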
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
new file mode 100644
index 0000000..1e14afb
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
+#include "support/DeepCopy.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClKernelGraph;
+class ClKernelBlueprint;
+
+enum class Complexity
+{
+    Simple,
+    Complex
+};
+
+/** Configurations for ClKernel
+ *
+ */
+struct ClKernelConfig
+{
+    UnitWorkloadStage stage{};
+    TileDescriptor    tile_desc{};
+    StoreType         store_type{};
+    friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1)
+    {
+        return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type;
+    }
+};
+
+struct ClKernelTensor
+{
+public:
+    using Id         = DependencyGraph::Id;
+    ClKernelTensor() = default;
+    ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info)
+        : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info }
+    {
+    }
+    bool operator==(const ClKernelTensor &other) const
+    {
+        return desc == other.desc;
+    }
+
+    Id            id{};
+    ITensorInfo  *desc{};
+    MemoryType    memory_type{};
+    AuxMemoryInfo memory_info{};
+};
+
+struct ClKernel
+{
+public:
+    using Id                         = DependencyGraph::Id;
+    ClKernel()                       = default;
+    virtual ~ClKernel()              = default;
+    ClKernel(const ClKernel &kernel) = default;
+    ClKernel &operator=(const ClKernel &kernel) = default;
+    ClKernel(ClKernel &&kernel)                 = default;
+    ClKernel &operator=(ClKernel &&kernel) = default;
+    ClKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ITensorDescPack<ClKernelTensor> &tensors)
+        : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors }
+    {
+    }
+    virtual bool operator==(const ClKernel &other) const = 0;
+    virtual Complexity complexity() const                = 0;
+    virtual Status generate(ClKernelBlueprint &bp) const = 0;
+    Id id() const
+    {
+        return _id;
+    }
+    ITensorDescPack<ClKernelTensor> tensors() const
+    {
+        return _tensors;
+    }
+    ClKernelConfig config() const
+    {
+        return _config;
+    }
+
+protected:
+    const ClKernelGraph            *_graph {};
+    Id                              _id{};
+    ClKernelConfig                  _config{};
+    ITensorDescPack<ClKernelTensor> _tensors{};
+};
+
+struct ClDirectConv2dKernel : public ClKernel
+{
+public:
+    Complexity complexity() const override
+    {
+        return Complexity::Complex;
+    }
+    ClDirectConv2dKernel()           = default;
+    ~ClDirectConv2dKernel() override = default;
+    ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+        : ClKernel{ graph, id, config, tensors }, desc{ desc }
+    {
+    }
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc);
+    bool operator==(const ClKernel &other) const override;
+    Status generate(ClKernelBlueprint &bp) const override;
+
+    ClDirectConv2dKernelDescriptor desc{};
+};
+
+struct ClAddKernel : public ClKernel
+{
+public:
+    Complexity complexity() const override
+    {
+        return Complexity::Simple;
+    }
+    ClAddKernel()           = default;
+    ~ClAddKernel() override = default;
+    ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+        : ClKernel{ graph, id, config, tensors }, desc{ desc }
+    {
+    }
+    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst);
+    bool operator==(const ClKernel &other) const override;
+    Status generate(ClKernelBlueprint &bp) const override;
+
+    ClEltwiseAddKernelDescriptor desc{};
+};
+
+struct ClKernelGraph
+{
+public:
+    using Id              = DependencyGraph::Id;
+    using KernelMap       = std::map<Id, utils::memory::deep_unique_ptr<ClKernel>>;
+    using KernelTensorMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelTensor>>;
+
+    ClKernelGraph()  = default;
+    ~ClKernelGraph() = default;
+
+    friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1)
+    {
+        return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors;
+    }
+
+    Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id())
+    {
+        tensor_id = graph.add_tensor(merge_point);
+        if(tensors.find(tensor_id) == tensors.end())
+        {
+            tensors[tensor_id] = utils::memory::make_deep_unique<ClKernelTensor, ClKernelTensor>(tensor_id, desc, memory_type, memory_info);
+        }
+        return Status{};
+    }
+
+    template <typename ContentT, typename KernelDescT>
+    Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack<ClKernelTensor> &tensors, Id &kernel_id)
+    {
+        const auto      src_tensors = tensors.get_const_src_tensors();
+        const auto      dst_tensors = tensors.get_const_dst_tensors();
+        std::vector<Id> src_tensor_ids{};
+        std::vector<Id> dst_tensor_ids{};
+        for(const auto &t : src_tensors)
+        {
+            src_tensor_ids.push_back(t->id);
+        }
+        for(const auto &t : dst_tensors)
+        {
+            dst_tensor_ids.push_back(t->id);
+        }
+        kernel_id          = graph.add_operator(src_tensor_ids, dst_tensor_ids).second;
+        auto k             = utils::memory::make_deep_unique<ClKernel, ContentT>(this, kernel_id, config, desc, tensors);
+        kernels[kernel_id] = std::move(k);
+        return Status{};
+    }
+
+    ClKernel *get_kernel(Id id)
+    {
+        return kernels.at(id).get();
+    }
+    const ClKernel *get_kernel(Id id) const
+    {
+        return kernels.at(id).get();
+    }
+
+    ClKernelTensor *get_tensor(Id id)
+    {
+        return tensors.at(id).get();
+    }
+    const ClKernelTensor *get_tensor(Id id) const
+    {
+        return tensors.at(id).get();
+    }
+
+    DependencyGraph graph{};
+    KernelMap       kernels{};
+    KernelTensorMap tensors{};
+};
+using Id = DependencyGraph::Id;
+
+std::vector<const ClKernel *> traverse(const ClKernelGraph &graph);
+std::vector<ClKernel *> traverse(ClKernelGraph &graph);
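+
+// Usage sketch (illustrative only, not part of the API): how the translation layer is expected
+// to populate a ClKernelGraph. `tensor_info`, `config` and `kernel_desc` are assumed to be
+// prepared by the caller, and `tensors` is an ITensorDescPack<ClKernelTensor> referencing the
+// registered kernel tensors.
+//
+//   ClKernelGraph k_graph{};
+//   DependencyGraph::Id src_id{}, dst_id{}, kernel_id{};
+//   k_graph.add_kernel_tensor(tensor_info, MemoryType::Core, AuxMemoryInfo{}, src_id);
+//   k_graph.add_kernel_tensor(tensor_info, MemoryType::Core, AuxMemoryInfo{}, dst_id);
+//   k_graph.add_kernel<ClDirectConv2dKernel>(config, kernel_desc, tensors, kernel_id);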
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
new file mode 100644
index 0000000..e97cf88
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx)
+{
+    workload.context = ctx;
+    ClKernelGraph kernel_graph;
+    workload.status = validate(op_graph);
+    ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+    workload.status = translate(kernel_graph, *op_graph.impl());
+    ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+    ClFusedKernelGraph fused_k_graph;
+    std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph);
+    ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+    workload.status = fuse(fused_k_graph);
+    ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+    workload.status = generate(workload, ctx, fused_k_graph);
+    ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+
+    // Get operator tensor id to workload tensor id map
+    const auto op_tensor_to_kernel_tensor       = fused_k_graph.original_graph->graph.get_merge_points();
+    const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points();
+    for(const auto op_t : op_graph.impl()->graph.src_tensors())
+    {
+        const auto kernel_t                   = op_tensor_to_kernel_tensor.at(op_t);
+        const auto workload_t                 = kernel_tensor_to_workload_tensor.at(kernel_t);
+        workload.op_tensor_id_lut[workload_t] = op_t;
+    }
+    for(const auto op_t : op_graph.impl()->graph.dst_tensors())
+    {
+        const auto kernel_t                   = op_tensor_to_kernel_tensor.at(op_t);
+        const auto workload_t                 = kernel_tensor_to_workload_tensor.at(kernel_t);
+        workload.op_tensor_id_lut[workload_t] = op_t;
+    }
+    return workload.status;
+}
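+
+// Usage sketch (illustrative only): a typical call site for build(), assuming an OperatorGraph
+// `op_graph` already populated via the public operator interfaces and a valid ClWorkloadContext `ctx`.
+//
+//   ClWorkload workload{};
+//   const Status st = build(workload, op_graph, ctx);
+//   // On success, the workload can be configured and run asynchronously by a ClCompositeOperator.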
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
new file mode 100644
index 0000000..2e8292b
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/DependencyGraph.h"
+
+#include <algorithm>
+#include <deque>
+#include <set>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points)
+    : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{}
+{
+}
+DependencyGraph::DependencyGraph(const std::vector<Id> &imported_tensors)
+    : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{}
+{
+    for(auto t : imported_tensors)
+    {
+        _adj_src_ops[t] = {};
+        _adj_dst_ops[t] = {};
+    }
+}
+
+Status DependencyGraph::update_merge_point(Id t_id, Id merge_point)
+{
+    if(_merge_to_internal.find(merge_point) == _merge_to_internal.end())
+    {
+        return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" };
+    }
+    _merge_to_internal[merge_point] = t_id;
+    return Status{};
+}
+
+DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor)
+{
+    Id new_tensor{ empty_id() };
+    if(merge_tensor != empty_id())
+    {
+        if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end())
+        {
+            new_tensor = _merge_to_internal[merge_tensor];
+        }
+        else
+        {
+            new_tensor                       = insert_new_tensor();
+            _merge_to_internal[merge_tensor] = new_tensor;
+        }
+    }
+    else
+    {
+        new_tensor = insert_new_tensor();
+    }
+    return new_tensor;
+}
+
+void DependencyGraph::remove_tensor(Id tensor)
+{
+    for(auto src_op : _adj_src_ops.at(tensor))
+    {
+        auto &dst_tensors = _adj_dst_tensors.at(src_op);
+        dst_tensors.erase(
+            std::remove(std::begin(dst_tensors), std::end(dst_tensors), tensor),
+            std::end(dst_tensors));
+    }
+    for(auto dst_op : _adj_dst_ops.at(tensor))
+    {
+        auto &src_tensors = _adj_src_tensors.at(dst_op);
+        src_tensors.erase(
+            std::remove(std::begin(src_tensors), std::end(src_tensors), tensor),
+            std::end(src_tensors));
+    }
+    _adj_src_ops.erase(tensor);
+    _adj_dst_ops.erase(tensor);
+}
+
+std::pair<Status, DependencyGraph::Id> DependencyGraph::add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs)
+{
+    Id new_op = insert_new_op();
+    for(Id tensor : inputs)
+    {
+        link_input(new_op, tensor);
+    }
+    for(Id tensor : outputs)
+    {
+        link_output(new_op, tensor);
+    }
+
+    // Use topological sort in order to detect possible loops / cycles.
+    // NOTE: This is not scalable. We will need either a better way of detecting loops, or to relax this invariant during operation and add a validate method instead
+    return std::pair<Status, DependencyGraph::Id>(topological_sort().first, new_op);
+}
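+
+// Usage sketch (illustrative only): building a two-operator chain; add_operator() runs the
+// topological sort internally, so an accidental cycle would be reported through the returned Status.
+//
+//   DependencyGraph g{};
+//   const auto t0  = g.add_tensor(DependencyGraph::empty_id());   // graph src tensor
+//   const auto t1  = g.add_tensor(DependencyGraph::empty_id());   // intermediate tensor
+//   const auto t2  = g.add_tensor(DependencyGraph::empty_id());   // graph dst tensor
+//   const auto op0 = g.add_operator({ t0 }, { t1 }).second;
+//   const auto op1 = g.add_operator({ t1 }, { t2 }).second;       // Status part of the pair is OK: no cycle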
+
+void DependencyGraph::remove_operator(Id op)
+{
+    for(auto src_tensor : _adj_src_tensors.at(op))
+    {
+        auto &dst_ops = _adj_dst_ops.at(src_tensor);
+        dst_ops.erase(
+            std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+            std::end(dst_ops));
+    }
+    for(auto dst_tensor : _adj_dst_tensors.at(op))
+    {
+        auto &src_ops = _adj_src_ops.at(dst_tensor);
+        src_ops.erase(
+            std::remove(std::begin(src_ops), std::end(src_ops), op),
+            std::end(src_ops));
+    }
+    _adj_src_tensors.erase(op);
+    _adj_dst_tensors.erase(op);
+}
+
+std::map<DependencyGraph::Id, DependencyGraph::Id> DependencyGraph::get_merge_points() const
+{
+    return _merge_to_internal;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::get_root_ops() const
+{
+    std::vector<Id> ops{};
+    const auto      op_list = all_ops();
+
+    for(auto op : op_list)
+    {
+        if(src_ops(op).empty())
+        {
+            ops.emplace_back(op);
+        }
+    }
+    return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::get_dst_ops() const
+{
+    std::vector<Id> ops{};
+    const auto      op_list = all_ops();
+
+    for(auto op : op_list)
+    {
+        if(dst_ops(op).empty())
+        {
+            ops.emplace_back(op);
+        }
+    }
+    return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_tensors(Id op) const
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    return _adj_src_tensors.at(op);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors(Id op) const
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    return _adj_dst_tensors.at(op);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_tensors() const
+{
+    std::vector<Id> tensors;
+    for(auto tensor_src_ops : _adj_src_ops)
+    {
+        if(tensor_src_ops.second.empty())
+            tensors.push_back(tensor_src_ops.first);
+    }
+    return tensors;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors() const
+{
+    std::vector<Id> tensors;
+    for(auto tensor_dst_ops : _adj_dst_ops)
+    {
+        if(tensor_dst_ops.second.empty())
+            tensors.push_back(tensor_dst_ops.first);
+    }
+    return tensors;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_ops_from_tensor(Id tensor) const
+{
+    return _adj_src_ops.at(tensor);
+}
+std::vector<DependencyGraph::Id> DependencyGraph::dst_ops_from_tensor(Id tensor) const
+{
+    return _adj_dst_ops.at(tensor);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::all_ops() const
+{
+    std::vector<Id> ops{};
+    std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+    {
+        return it.first;
+    });
+    return ops;
+}
+
+bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const
+{
+    for(auto child_op : dst_ops_from_tensor(src_tensor))
+    {
+        if(path_exists_from_op_to_op(child_op, dst_op))
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const
+{
+    if(src_op == dst_op)
+    {
+        return true;
+    }
+    if(is_in(src_op, get_dst_ops()))
+    {
+        return false;
+    }
+    for(auto child_tensor : dst_tensors(src_op))
+    {
+        if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::all_tensors() const
+{
+    std::vector<Id> tensors{};
+    std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+    {
+        return it.first;
+    });
+    return tensors;
+}
+
+unsigned int DependencyGraph::number_of_ops() const
+{
+    return _adj_src_tensors.size();
+}
+
+unsigned int DependencyGraph::number_of_tensors() const
+{
+    return _adj_src_ops.size();
+}
+
+DependencyGraph::Id DependencyGraph::insert_new_tensor()
+{
+    Id new_tensor            = _tensor_id.alloc();
+    _adj_src_ops[new_tensor] = {};
+    _adj_dst_ops[new_tensor] = {};
+    return new_tensor;
+}
+DependencyGraph::Id DependencyGraph::insert_new_op()
+{
+    Id new_op                = _operator_id.alloc();
+    _adj_src_tensors[new_op] = {};
+    _adj_dst_tensors[new_op] = {};
+    return new_op;
+}
+void DependencyGraph::link_input(Id op, Id in_tensor)
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor));
+    ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor));
+    _adj_src_tensors[op].push_back(in_tensor);
+    _adj_dst_ops[in_tensor].push_back(op);
+}
+void DependencyGraph::link_output(Id op, Id out_tensor)
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor));
+    ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor));
+    _adj_dst_tensors[op].push_back(out_tensor);
+    _adj_src_ops[out_tensor].push_back(op);
+}
+bool DependencyGraph::tensor_exists(Id tensor) const
+{
+    return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+}
+bool DependencyGraph::operator_exists(Id op) const
+{
+    return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+}
+
+bool DependencyGraph::is_src_tensor(Id tensor) const
+{
+    if(!tensor_exists(tensor))
+    {
+        return false;
+    }
+    return _adj_src_ops.at(tensor).empty();
+}
+
+bool DependencyGraph::is_dst_tensor(Id tensor) const
+{
+    if(!tensor_exists(tensor))
+    {
+        return false;
+    }
+    return _adj_dst_ops.at(tensor).empty();
+}
+bool DependencyGraph::is_src_tensor_of(Id op, Id tensor) const
+{
+    if(!operator_exists(op) || !tensor_exists(tensor))
+    {
+        return false;
+    }
+    const auto op_inputs = src_tensors(op);
+    return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+}
+bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const
+{
+    if(!operator_exists(op) || !tensor_exists(tensor))
+    {
+        return false;
+    }
+    const auto op_outputs = dst_tensors(op);
+    return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+}
+bool DependencyGraph::are_connected(Id op, Id tensor) const
+{
+    return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+}
+std::vector<DependencyGraph::Id> DependencyGraph::src_ops(Id op) const
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    std::vector<Id> ops{};
+    for(Id src_tensor : src_tensors(op))
+    {
+        ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+    }
+    return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_ops(Id op) const
+{
+    ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+    std::vector<Id> ops{};
+    for(Id dst_tensor : _adj_dst_tensors.at(op))
+    {
+        ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+    }
+    return ops;
+}
+
+std::pair<Status, std::vector<DependencyGraph::OpPack>> DependencyGraph::topological_sort() const
+{
+    // In-degree (number of source operators feeding into an op)
+    std::map<Id, unsigned int> in_degree{};
+    std::set<Id>        visited_ops{};
+    std::deque<Id>      zero_in_degree_ops{};
+    std::vector<OpPack> sorted_op_packs{};
+    for(auto op : all_ops())
+    {
+        const auto degree = src_ops(op).size();
+        in_degree[op]     = degree;
+        if(degree == 0)
+        {
+            zero_in_degree_ops.push_back(op);
+            visited_ops.insert(op);
+        }
+    }
+
+    while(!zero_in_degree_ops.empty())
+    {
+        const Id op = zero_in_degree_ops.front();
+        zero_in_degree_ops.pop_front();
+        sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+        for(const auto next_op : dst_ops(op))
+        {
+            if(in_degree[next_op] > 0)
+            {
+                in_degree[next_op]--;
+            }
+            if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+            {
+                zero_in_degree_ops.push_back(next_op);
+                visited_ops.insert(next_op);
+            }
+        }
+    }
+
+    // If there are remaining ops with in_degree > 0, it indicates that there are cycles in the graph
+    Status st{};
+    if(sorted_op_packs.size() != number_of_ops())
+    {
+        st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" };
+    }
+    return std::make_pair(st, sorted_op_packs);
+}
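+
+// Worked example (illustrative): for a diamond-shaped graph where op0 writes two tensors consumed
+// by op1 and op2, and op3 reads the outputs of both, the initial in-degrees are
+// { op0: 0, op1: 1, op2: 1, op3: 2 }. op0 is dequeued first and drops op1 and op2 to zero; op3 only
+// reaches zero after both parents are processed, giving the order op0, op1, op2, op3 with no cycle reported.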
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
new file mode 100644
index 0000000..bfa2eac
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
+
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+template <typename TDesc>
+class ITensorDescPack
+{
+public:
+    struct PackElement
+    {
+        PackElement()                    = default;
+        ~PackElement()                   = default;
+        PackElement(const PackElement &) = default;
+        PackElement &operator=(const PackElement &) = default;
+        PackElement(PackElement &&)                 = default;
+        PackElement &operator=(PackElement &&) = default;
+        PackElement(int id, TDesc *tensor)
+            : id(id), tensor(tensor), ctensor(nullptr)
+        {
+        }
+        PackElement(int id, const TDesc *ctensor)
+            : id(id), tensor(nullptr), ctensor(ctensor)
+        {
+        }
+
+        int          id{ -1 };
+        TDesc       *tensor{ nullptr };
+        const TDesc *ctensor{ nullptr };
+
+        friend bool operator==(const PackElement &elem0, const PackElement &elem1)
+        {
+            const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor);
+            const bool same_tensor  = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor);
+
+            return elem0.id == elem1.id && (same_ctensor || same_tensor);
+        }
+    };
+
+public:
+    /** Default Constructor */
+    ITensorDescPack()                                           = default;
+    ~ITensorDescPack()                                          = default;
+    ITensorDescPack<TDesc>(const ITensorDescPack<TDesc> &other) = default;
+    ITensorDescPack<TDesc> &operator=(const ITensorDescPack<TDesc> &other) = default;
+    ITensorDescPack<TDesc>(ITensorDescPack<TDesc> &&other)                 = default;
+    ITensorDescPack<TDesc> &operator=(ITensorDescPack<TDesc> &&other) = default;
+    /**  Initializer list Constructor */
+    ITensorDescPack(std::initializer_list<PackElement> l)
+        : _pack{}
+    {
+        for(auto &e : l)
+        {
+            _pack[e.id] = e;
+        }
+    }
+    /** Add tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_tensor(int id, TDesc *tensor)
+    {
+        _pack[id] = PackElement(id, tensor);
+    }
+
+    /** Add const tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_const_tensor(int id, const TDesc *tensor)
+    {
+        _pack[id] = PackElement(id, tensor);
+    }
+    /** Get the tensor of a given id from the pack
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+     */
+    TDesc *get_tensor(int id)
+    {
+        auto it = _pack.find(id);
+        return it != _pack.end() ? it->second.tensor : nullptr;
+    }
+    /** Get the constant tensor of a given id
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return Pointer to the tensor if it exists and is const, otherwise nullptr
+     */
+    const TDesc *get_const_tensor(int id) const
+    {
+        auto it = _pack.find(id);
+        if(it != _pack.end())
+        {
+            return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+        }
+        return nullptr;
+    }
+    /** Remove the tensor stored with the given id
+     *
+     * @param[in] id ID of tensor to remove
+     */
+    void remove_tensor(int id)
+    {
+        _pack.erase(id);
+    }
+    /** Pack size accessor
+     *
+     * @return Number of tensors registered to the pack
+     */
+    size_t size() const
+    {
+        return _pack.size();
+    }
+    /** Checks if pack is empty
+     *
+     * @return True if empty else false
+     */
+    bool empty() const
+    {
+        return _pack.empty();
+    }
+
+    /** Get the ACL_SRC_* tensors
+     *
+     * @return std::vector<TDesc *>
+     */
+    std::vector<TDesc *> get_src_tensors()
+    {
+        std::vector<TDesc *> src_tensors{};
+        for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+        {
+            auto tensor = get_tensor(id);
+            if(tensor != nullptr)
+            {
+                src_tensors.push_back(tensor);
+            }
+        }
+        return src_tensors;
+    }
+    /** Get the const ACL_SRC_* tensors
+     *
+     * @return std::vector<const TDesc *>
+     */
+    std::vector<const TDesc *> get_const_src_tensors() const
+    {
+        std::vector<const TDesc *> src_tensors{};
+        for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+        {
+            auto tensor = get_const_tensor(id);
+            if(tensor != nullptr)
+            {
+                src_tensors.push_back(tensor);
+            }
+        }
+        return src_tensors;
+    }
+    /** Get the ACL_DST_* tensors
+     *
+     * @return std::vector<TDesc *>
+     */
+    std::vector<TDesc *> get_dst_tensors()
+    {
+        std::vector<TDesc *> dst_tensors{};
+        for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+        {
+            auto tensor = get_tensor(id);
+            if(tensor != nullptr)
+            {
+                dst_tensors.push_back(tensor);
+            }
+        }
+        return dst_tensors;
+    }
+    /** Get the const ACL_DST_* tensors
+     *
+     * @return std::vector<const TDesc *>
+     */
+    std::vector<const TDesc *> get_const_dst_tensors() const
+    {
+        std::vector<const TDesc *> dst_tensors{};
+        for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+        {
+            auto tensor = get_const_tensor(id);
+            if(tensor != nullptr)
+            {
+                dst_tensors.push_back(tensor);
+            }
+        }
+        return dst_tensors;
+    }
+
+    friend bool operator==(const ITensorDescPack<TDesc> &pack0, const ITensorDescPack<TDesc> &pack1)
+    {
+        return pack0._pack == pack1._pack;
+    }
+
+private:
+    std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
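+
+// Usage sketch (illustrative only): packing tensor descriptors by their ACL slot ids and reading
+// them back. `src_desc` and `dst_desc` are assumed to be valid `const ClKernelTensor *` pointers.
+//
+//   ITensorDescPack<ClKernelTensor> pack{};
+//   pack.add_const_tensor(TensorType::ACL_SRC_0, src_desc);
+//   pack.add_const_tensor(TensorType::ACL_DST_0, dst_desc);
+//   const auto srcs = pack.get_const_src_tensors();   // { src_desc }
+//   const auto dsts = pack.get_const_dst_tensors();   // { dst_desc }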
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
new file mode 100644
index 0000000..4b91c0f
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info,
+                         DependencyGraph::Id &id)
+{
+    ARM_COMPUTE_UNUSED(op_graph);
+    return k_graph.add_kernel_tensor(op_tensor.desc, memory_type, memory_info, id, op_tensor.id);
+}
+
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id)
+{
+    // For a tensor t:
+    // 1. If t is a src tensor of the entire op graph, then it's Core.
+    //    (Optimisation opportunity: if we guarantee that all translate methods are called in topological order, we can always assign t to Core,
+    //       because even if the op is non-root (which would mean t should be an Aux tensor), the src tensors would already have been determined by the ancestor ops (topological order), and thus would not be overridden by it)
+    // 2. If t is a dst tensor of the entire op graph, then it's Core.
+    // 3. Aux tensors with Persistent and Prepare lifetimes are manually specified.
+    // 4. All other tensors not captured by the above are assigned Aux, with a lifetime of Temporary.
+    bool          is_src_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.src_tensors());
+    bool          is_dst_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.dst_tensors());
+    MemoryType    memory_type;
+    AuxMemoryInfo memory_info;
+    if(is_src_tensor_of_graph || is_dst_tensor_of_graph)
+    {
+        memory_type = MemoryType::Core;
+    }
+    else
+    {
+        memory_type          = MemoryType::Auxiliary;
+        memory_info.lifetime = AuxMemoryLifetime::Temporary;
+        memory_info.size     = op_tensor.desc->total_size();
+    }
+    return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id);
+}
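+
+// Illustrative outcome of the rules above: in the fused conv2d + elementwise add example, the conv2d
+// input, weights and the final add output are src/dst tensors of the op graph and become Core tensors,
+// while the intermediate conv2d output only lives between the two fused kernels and becomes an
+// Auxiliary tensor with Temporary lifetime, sized to its ITensorInfo::total_size().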
+
+/** Get the kernel size threshold for using the direct convolution method with the NHWC data layout.
+ *
+ * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp
+ *
+ * @note Direct convolution should be executed when the kernel's spatial dimensions are greater than or equal to the value returned by this function
+ *
+ * @param[in] gpu_target GPU target
+ *
+ * @return The kernel size threshold for using the direct convolution method with the NHWC data layout
+ */
+size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
+{
+    switch(gpu_target)
+    {
+        case arm_compute::GPUTarget::G76:
+        case arm_compute::GPUTarget::G77:
+        case arm_compute::GPUTarget::G78:
+            return 5;
+        case arm_compute::GPUTarget::G71:
+        case arm_compute::GPUTarget::G72:
+        case arm_compute::GPUTarget::MIDGARD:
+        case arm_compute::GPUTarget::BIFROST:
+            return 7;
+        default:
+            return 5;
+    }
+}
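+
+// Example (illustrative): on a G77 target this returns 5, so a 5x5 (or larger) kernel satisfies the
+// direct-convolution size check, whereas a 3x3 kernel does not.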
+} // namespace
+
+bool operator==(const OpTensor &t0, const OpTensor &t1)
+{
+    return std::make_tuple(t0.id()) == std::make_tuple(t1.id());
+}
+bool operator==(const Padding2D &pad0, const Padding2D &pad1)
+{
+    return std::make_tuple(pad0.top, pad0.right, pad0.bottom, pad0.left) == std::make_tuple(pad1.top, pad1.right, pad1.bottom, pad1.left);
+}
+bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1)
+{
+    return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation);
+}
+
+bool operator==(const AddDescriptor &, const AddDescriptor &)
+{
+    return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same
+}
+
+bool Conv2dContent::operator==(const OperatorContent &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const Conv2dContent *>(&other);
+    return desc == converted.desc;
+}
+
+bool AddContent::operator==(const OperatorContent &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const AddContent *>(&other);
+    return desc == converted.desc;
+}
+
+ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target)
+{
+    // Modified from ClConv2d::get_convolution_method
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+
+    const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
+    const Size2D        dilation = conv2d_desc.dilation;
+
+    const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+    const std::vector<ConfigurationMethod> known_configs =
+    {
+        // Alexnet
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        // VGG16 / VGG19
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+    };
+
+    const auto find_config = [&](ConfigurationMethod c)
+    {
+        const ConvolutionConfiguration config      = c.first;
+        const PadStrideInfo            info        = std::get<3>(config);
+        const DataLayout               data_layout = std::get<4>(config);
+
+        return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right()
+               && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == src->data_layout());
+    };
+
+    std::vector<ConfigurationMethod>::const_iterator found;
+    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    {
+        return (*found).second;
+    }
+
+    if(dilation != Size2D(1U, 1U))
+    {
+        return ConvolutionMethod::GEMM;
+    }
+    else
+    {
+        if(src->data_layout() == DataLayout::NCHW)
+        {
+            ARM_COMPUTE_ERROR("NCHW not supported");
+        }
+        else
+        {
+            const bool   is_direct_valid           = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc }));
+            const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
+
+            // SRGAN case
+            if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3)
+               && is_direct_valid)
+            {
+                return ConvolutionMethod::DIRECT;
+            }
+
+            // Floating-point case: GeMM/Direct
+            if(is_data_type_float(src->data_type()))
+            {
+                // Get dst shape
+                TensorShape output_shape       = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride);
+                const bool  is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+                const bool  is_ifm_ge_16       = src->dimension(idx_c) >= 16;
+                const bool  is_ofm_lte_8       = weights->dimension(3U) <= 8;
+                const bool  workload_gte_8192  = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+                const bool  is_ifm_gt_ofm      = src->dimension(idx_c) > weights->dimension(3U);
+
+                // Direct convolution case
+                if(is_direct_valid)
+                {
+                    if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD))
+                    {
+                        if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+                        {
+                            return ConvolutionMethod::DIRECT;
+                        }
+                    }
+                    else
+                    {
+                        if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+                        {
+                            return ConvolutionMethod::DIRECT;
+                        }
+                    }
+                }
+
+                // Default case
+                return ConvolutionMethod::GEMM;
+            }
+
+            // Generic case for quantized. Only GeMM
+            return ConvolutionMethod::GEMM;
+        }
+    }
+    return ConvolutionMethod::DIRECT;
+}
+
+Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const
+{
+    const auto input  = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto dst    = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    const auto method = forced_method_enabled ? forced_method : Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target());
+    switch(method)
+    {
+        case ConvolutionMethod::DIRECT:
+        {
+            return translate_direct_conv2d(kernel_graph);
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented");
+        }
+    }
+    return Status{};
+}
+Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const
+{
+    const auto input  = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto bias   = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
+    const auto dst    = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
+
+    ITensorDescPack<ClKernelTensor> tensors;
+
+    DependencyGraph::Id input_id;
+    auto                st = add_kernel_tensor(kernel_graph, *_graph, *input, input_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(input_id));
+
+    DependencyGraph::Id weight_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *weight, weight_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id));
+
+    if(bias != nullptr)
+    {
+        DependencyGraph::Id bias_id;
+        st = add_kernel_tensor(kernel_graph, *_graph, *bias, bias_id);
+        ARM_COMPUTE_RETURN_ON_ERROR(st);
+        tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id));
+    }
+
+    DependencyGraph::Id dst_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+    DependencyGraph::Id direct_conv2d_id;
+    const auto          kernel_desc = ClDirectConv2dKernelDescriptor{ desc };
+
+    st = ClDirectConv2dKernel::validate(input->desc, weight->desc, bias == nullptr ? nullptr : bias->desc, dst->desc, kernel_desc);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+    st = kernel_graph.add_kernel<ClDirectConv2dKernel>(config, kernel_desc, tensors, direct_conv2d_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    ARM_COMPUTE_UNUSED(direct_conv2d_id);
+
+    return Status{};
+}
+
+Status AddContent::translate(ClKernelGraph &kernel_graph) const
+{
+    const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+    ITensorDescPack<ClKernelTensor> tensors;
+
+    DependencyGraph::Id lhs_id;
+    auto                st = add_kernel_tensor(kernel_graph, *_graph, *lhs, lhs_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id));
+
+    DependencyGraph::Id rhs_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *rhs, rhs_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id));
+
+    DependencyGraph::Id dst_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+    DependencyGraph::Id add_id;
+    ClKernelConfig      config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+
+    st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    st = kernel_graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    ARM_COMPUTE_UNUSED(add_id);
+
+    return Status{};
+}
+
+std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph)
+{
+    std::vector<const OperatorContent *> ops;
+    const auto                           sorted = graph.graph.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        ops.push_back(graph.operators.at(pack.op).get());
+    }
+    return ops;
+}
+
+std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph)
+{
+    std::vector<OperatorContent *> ops;
+    const auto                     sorted = graph.graph.topological_sort();
+    for(const auto &pack : sorted.second)
+    {
+        ops.push_back(graph.operators.at(pack.op).get());
+    }
+    return ops;
+}
+
+Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph)
+{
+    for(const auto &op : traverse(op_graph))
+    {
+        const auto st = op->translate(kernel_graph);
+        ARM_COMPUTE_RETURN_ON_ERROR(st);
+    }
+    return Status{};
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
new file mode 100644
index 0000000..c33e189
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
+
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
+
+#include "support/Cast.h"
+#include "support/DeepCopy.h"
+
+#include <map>
+#include <tuple>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+enum class OperatorComplexity
+{
+    Complex = 0,
+    Simple
+};
+
+struct ClKernelGraph;
+struct OpTensorContent
+{
+public:
+    using Id          = DependencyGraph::Id;
+    OpTensorContent() = default;
+    OpTensorContent(Id id)
+        : id{ id }, desc{}
+    {
+    }
+    OpTensorContent(Id id, ITensorInfo *desc)
+        : id{ id }, desc{ desc }
+    {
+    }
+    ~OpTensorContent()                       = default;
+    OpTensorContent(const OpTensorContent &) = default;
+    OpTensorContent &operator=(const OpTensorContent &) = default;
+    OpTensorContent(OpTensorContent &&)                 = default;
+    OpTensorContent &operator=(OpTensorContent &&) = default;
+    bool operator==(const OpTensorContent &other) const
+    {
+        return desc == other.desc;
+    }
+
+    const ITensorInfo *get_tensor_info() const
+    {
+        return desc;
+    }
+    ITensorInfo *get_tensor_info()
+    {
+        return desc;
+    }
+
+    Id           id{};
+    ITensorInfo *desc{};
+};
+
+struct OperatorContent
+{
+public:
+    using Id          = DependencyGraph::Id;
+    OperatorContent() = default;
+    OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack<OpTensorContent> &tensors)
+        : _graph{ graph }, _id{ id }, _tensors{ tensors }
+    {
+    }
+    OperatorContent(const OperatorContent &op) = default;
+    OperatorContent &operator=(const OperatorContent &op) = default;
+    OperatorContent(OperatorContent &&op)                 = default;
+    OperatorContent &operator=(OperatorContent &&op)            = default;
+    virtual ~OperatorContent()                                  = default;
+    virtual OperatorComplexity complexity() const               = 0;
+    virtual bool operator==(const OperatorContent &other) const = 0;
+    virtual Status translate(ClKernelGraph &kernel_graph) const = 0;
+
+protected:
+    const OperatorGraph::Implementation *_graph {};
+    Id                                   _id{};
+    ITensorDescPack<OpTensorContent>     _tensors{};
+};
+
+struct Conv2dContent : public OperatorContent
+{
+public:
+    Conv2dContent() = default;
+    Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+        : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false)
+    {
+    }
+    // Temporary. Do not need to pass ConvolutionMethod
+    Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors, ConvolutionMethod method)
+        : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true)
+    {
+    }
+    ~Conv2dContent()                     = default;
+    Conv2dContent(const Conv2dContent &) = default;
+    Conv2dContent &operator=(const Conv2dContent &) = default;
+    Conv2dContent(Conv2dContent &&)                 = default;
+    Conv2dContent &operator=(Conv2dContent &&) = default;
+    bool operator==(const OperatorContent &other) const override;
+    OperatorComplexity complexity() const override
+    {
+        return OperatorComplexity::Complex;
+    }
+    void set_method(ConvolutionMethod method)
+    {
+        forced_method_enabled = true;
+        forced_method         = method;
+    }
+
+    Status translate(ClKernelGraph &kernel_graph) const override;
+    /** Replicate the heuristics of @ref ClConv2d::get_convolution_method(), except that unsupported data types and data layouts are removed from the heuristics
+     *
+     * @param[in] src         Source (input) tensor info
+     * @param[in] weights     Weights tensor info
+     * @param[in] dst         Destination (output) tensor info
+     * @param[in] conv2d_desc Conv2d operator descriptor
+     * @param[in] gpu_target  GPU target
+     *
+     * @return The selected ConvolutionMethod
+     */
+    static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target);
+
+    Conv2dDescriptor  desc{};
+    ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D };
+    bool              forced_method_enabled{ false };
+
+private:
+    Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const;
+};
+
+class AddContent : public OperatorContent
+{
+public:
+    AddContent() = default;
+    AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+        : OperatorContent(graph, id, tensors), desc(desc)
+    {
+    }
+    ~AddContent()                  = default;
+    AddContent(const AddContent &) = default;
+    AddContent &operator=(const AddContent &) = default;
+    AddContent(AddContent &&)                 = default;
+    AddContent &operator=(AddContent &&) = default;
+    bool operator==(const OperatorContent &other) const override;
+    OperatorComplexity complexity() const override
+    {
+        return OperatorComplexity::Simple;
+    }
+    Status translate(ClKernelGraph &kernel_graph) const override;
+
+private:
+    AddDescriptor desc{};
+};
+
+struct OperatorGraph::Implementation
+{
+public:
+    template <typename ContentT, typename... Args>
+    void add_node(Operator::Id id, Args &&... args)
+    {
+        operators[id] = utils::memory::make_deep_unique<OperatorContent, ContentT>(this, id, std::forward<Args>(args)...);
+    }
+
+    template <typename... Args>
+    void add_tensor(OpTensor::Id id, Args &&... args)
+    {
+        tensors[id] = utils::memory::make_deep_unique<OpTensorContent, OpTensorContent>(id, std::forward<Args>(args)...);
+    }
+
+    using Dependency  = DependencyGraph;
+    using OperatorMap = std::map<Operator::Id, utils::memory::deep_unique_ptr<OperatorContent>>;
+    using OpTensorMap = std::map<OpTensor::Id, utils::memory::deep_unique_ptr<OpTensorContent>>;
+
+    Implementation()  = default;
+    ~Implementation() = default;
+
+    friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1)
+    {
+        return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors;
+    }
+
+    Dependency  graph{};
+    OperatorMap operators{};
+    OpTensorMap tensors{};
+    Status      status{};
+};
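+
+// Usage sketch (illustrative only): how the public operator interfaces are expected to populate the
+// implementation. `t_id`, `op_id`, `tensor_info`, `conv2d_desc` and `op_tensors` are assumptions
+// standing in for values provided by the caller.
+//
+//   OperatorGraph::Implementation impl{};
+//   impl.add_tensor(t_id, tensor_info);                            // stores an OpTensorContent
+//   impl.add_node<Conv2dContent>(op_id, conv2d_desc, op_tensors);  // stores a Conv2dContent node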
+
+std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph);
+
+std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph);
+
+Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
\ No newline at end of file