Rewrite dynamic fusion

The new version introduces the following major changes:

* Change public interface to simplify and standardize the user experience
    - Use the term "Workload" uniformly
    - Simplify operator interface to be a set of static methods:
      validate_op(), create_op()

* Separate the kernel writing into its own component (template_writer).
  This is to allow the co-development of GpuKernelWriter, and to allow
  easy replacement once GpuKernelWriter is mature.

* Optimize the core fusion algorithm used by the component graph. The
  details can be found in GpuKernelComponentGraph::fuse()

* Use Gpu instead of Cl prefixes for most of the Workload interfaces
  (except for runtime and kernel components, which have to be language specific)
  This allows the potential extension to other Gpu languages in the
  future.

* Refactor runtime memory interface so that auxiliary tensor handling
  is separate from the user tensor passing. This is because the former
  is less stable and may require extension in the future.

* Hide source code object from the user as it is not required at the
  moment

* Deprecate the old prototype entirely by disabling it in SCons build

Resolves COMPMID-5510, COMPMID-5512, COMPMID-5513

Change-Id: If69d2362856f2de4503546b7b6cf48a525cf3079
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8406
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
new file mode 100644
index 0000000..13c0b14
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelVariableTable.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+#include <sstream> // std::stringstream used to build unique variable names in declare_variable()
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+    // Do not re-declare if the variable associated with the tensor has already been declared
+    if(get_variable(tensor).has_valid_id())
+    {
+        ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+        return;
+    }
+    // Declare variable associated with the tensor
+    std::stringstream ss;
+    ss << alias << "_t" << tensor->id();
+    const auto     uniq_name = ss.str();
+    TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
+    if(is_interm)
+    {
+        _interm_var = var;
+        _interm_tensors.insert(tensor->id());
+    }
+    else
+    {
+        _vars.emplace(tensor->id(), var);
+    }
+}
+
+GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
+{
+    const TensorVariable empty_var{};
+    if(_vars.find(tensor->id()) != _vars.end())
+    {
+        return _vars.at(tensor->id());
+    }
+    if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
+    {
+        return _interm_var;
+    }
+    return empty_var;
+}
+
+GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+{
+    VariableList vars{};
+    for(const auto &tensor : tensors)
+    {
+        if(!tensor->has_valid_id())
+        {
+            continue;
+        }
+        vars.push_back(get_variable(tensor));
+    }
+    return vars;
+}
+
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
+    : value{ var.uniq_name }
+{
+}
+
+TagVal::TagVal(const std::string &val)
+    : value{ val }
+{
+}
+
+TagVal::TagVal(const char *val)
+    : value{ std::string(val) }
+{
+}
+
+TagVal::TagVal(const DataType &data_type)
+    : value{ get_cl_type_from_data_type(data_type) }
+{
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
new file mode 100644
index 0000000..4eee396
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table
+ */
+class GpuKernelVariableTable
+{
+public:
+    /** A tensor variable whose main purposes are:
+     *  - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+     *  - Hold the generated variable name for the associated tensor info
+     */
+    struct TensorVariable
+    {
+    public:
+        TensorVariable()                       = default;
+        TensorVariable(const TensorVariable &) = default;
+        TensorVariable       &operator=(const TensorVariable &) = default;
+        ITensorInfo::Id       id{ ITensorInfo::invalid_tensor_id };
+        std::string           uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+        GpuKernelArgumentInfo kernel_argument_info{};
+        bool                  has_valid_id() const
+        {
+            return id != ITensorInfo::invalid_tensor_id;
+        }
+    };
+    using VariableList = std::vector<TensorVariable>;
+
+public:
+    /** Declare a @ref TensorVariable for a corresponding tensor info.
+     *
+     * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
+     *        Therefore, the order of declaration is important. It's assumed that the components declaring the variable is already in correct order
+     *
+     * @param[in] tensor        Tensor info with which the new variable is associated
+     * @param[in] argument_info Kernel argument information
+     * @param[in] is_interm     If the new variable is an intermediate variable
+     * @param[in] alias         Alias for the variable. Will be used as part of the variable name
+     */
+    void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+    /** Get the @ref TensorVariable associated with @p tensor
+     *
+     * @param[in] tensor Tensor info to be queried
+     *
+     * @return TensorVariable
+     */
+    TensorVariable get_variable(const ITensorInfo *tensor) const;
+    /** Get the @ref TensorVariable list associated with @p tensors
+     * @note Empty tensors are skipped
+     *
+     * @param[in] tensors List of tensor infos to be queried
+     *
+     * @return VariableList
+     */
+    VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
+
+private:
+    std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
+    TensorVariable            _interm_var{};           /**< Intermediate variable */
+    std::set<ITensorInfo::Id> _interm_tensors{};       /**< Tensors associated with the single intermediate variable */
+};
+
+/** A tag value will substitute a tag in a string template during its instantiation */
+struct TagVal
+{
+    /** Default constructor */
+    TagVal() = default;
+    /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
+    TagVal(const GpuKernelVariableTable::TensorVariable &var);
+    /** Construct a @ref TagVal from an integral type */
+    template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+    TagVal(T val)
+        : value{ support::cpp11::to_string(val) }
+    {
+    }
+    /** Construct a @ref TagVal from a string */
+    TagVal(const std::string &val);
+    /** Construct a @ref TagVal from a c-style string */
+    TagVal(const char *val);
+    /** Construct a @ref TagVal from a @ref DataType */
+    TagVal(const DataType &data_type);
+    /** Value of the TagVal, already converted to its string form */
+    std::string value{};
+};
+
+/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
+using Tag = std::string;
+
+/** Tag lookup table. It is used to instantiate a string template */
+using TagLUT = std::unordered_map<Tag, TagVal>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
new file mode 100644
index 0000000..c85ddf5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuKernelVariableTable;
+
+/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
+ */
+class IGpuTemplateComponentWriter
+{
+public:
+    using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+    /** Constructor
+     *
+     * @param[in] id      Component id
+     * @param[in] tensors Tensor arguments to the components
+     */
+    IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors)
+        : _id{ id }, _tensors{ tensors }
+    {
+    }
+    /** Destructor */
+    virtual ~IGpuTemplateComponentWriter()
+    {
+    }
+    /** Generate kernel component name */
+    virtual std::string get_name() const = 0;
+    /** Generate kernel component code template
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return std::string Component code
+     */
+    virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
+    /** Declare all variables used by the component in the @p vtable
+     *
+     * @param[out] vtable     Variable table
+     * @param[in]  comp_group Component group of which the component is a part of
+     */
+    virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+    /** Generate the tag look-up table used to instantiate the component code.
+     *
+     * @param[in] vtable     Variable table
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return TagLUT  Tag lookup table
+     */
+    virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+    /** Generate additional macros used in the component */
+    virtual std::string get_additional_macros() const
+    {
+        return "";
+    }
+    /** Generate the build options used in the component
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return CLBuildOptions Build options
+     */
+    virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
+    {
+        ARM_COMPUTE_UNUSED(comp_group);
+        return CLBuildOptions{};
+    }
+    /** Generate the component config id string used for tuning */
+    virtual std::string get_config_id() const
+    {
+        return "";
+    }
+    /** Generate the header list used in the component */
+    virtual std::set<std::string> get_headers_list() const
+    {
+        return std::set<std::string> {};
+    }
+    /** Generate the execution window for the component */
+    virtual Window get_window() const
+    {
+        return Window{};
+    }
+    /** Get tensor arguments */
+    ArgumentPack<ITensorInfo> tensors() const
+    {
+        return _tensors;
+    }
+    /** Get component id */
+    ComponentId id() const
+    {
+        return _id;
+    }
+
+private:
+    ComponentId               _id{ -1 };
+    ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
new file mode 100644
index 0000000..870de64
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateDirectConv2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId                      id,
+                                               const ArgumentPack<ITensorInfo> &tensors,
+                                               const Attributes                &attributes,
+                                               const Settings                  &settings)
+    : IGpuTemplateComponentWriter{ id, tensors },
+      _src{},
+      _weight{},
+      _bias{},
+      _dst{},
+      _attributes{ attributes },
+      _settings{ settings }
+{
+    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+    {
+        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+    }
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
+}
+
+std::string ClTemplateDirectConv2d::get_name() const
+{
+    return "direct_conv2d";
+}
+
+std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+
+    const auto channel_idx   = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const auto k0            = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
+    const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
+
+    std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src)            {{src}}
+// IN_1(wei)            {{weight}}
+)_";
+    if(_bias && _bias->has_valid_id())
+    {
+        code += R"_(
+// IN_1(bia)            {{bias}}
+)_";
+    }
+    code += R"_(
+// OUT(dst, accum)      {{dst}}
+
+// Initialize the accumulators
+TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+{
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+    // .v    = access the whole vector (OpenCL vector)
+    // .s[x] = access the vector element at position x (scalar access)
+    TILE(int, M0, 1, xi);
+    TILE(int, M0, 1, yi);
+
+    // Convert the linear index to coordinate
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+        yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+        xi[i].v -= {{PAD_LEFT}};
+        yi[i].v -= {{PAD_TOP}};
+    })
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        {{dst}}[i].v = 0;
+    })
+
+    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+    {
+        int ck = 0;
+        int xk = i % _IWEI_WIDTH;
+        int yk = i / _IWEI_WIDTH;
+
+        int k = 0;
+        for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+        {
+            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+            TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+            // Initialize tiles
+            LOOP_UNROLLING(int, i, 0, 1, M0,
+            {
+                a[i].v = {{ZERO_VALUE}};
+            })
+
+            LOOP_UNROLLING(int, i, 0, 1, N0,
+            {
+                b[i].v = {{ZERO_VALUE}};
+            })
+
+            // Load tile from the src tensor
+            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+            // Load tile from the weights tensor
+            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+            // Compute the matrix multiplication between two tiles
+            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+            ck += K0;
+        }
+
+        // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
+        // This #if directive should be removed in case of dynamic tensor support
+)_";
+
+    if(leftover_loop)
+    {
+        code += R"_(
+        // Left-over accumulations
+        for(; k < _ISRC_CHANNELS; ++k)
+        {
+            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+            TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+            // Initialize tiles
+            LOOP_UNROLLING(int, i, 0, 1, M0,
+            {
+                a[i].v = {{ZERO_VALUE}};
+            })
+
+            LOOP_UNROLLING(int, i, 0, 1, N0,
+            {
+                b[i].v = {{ZERO_VALUE}};
+            })
+
+            // Load tile from the src tensor
+            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+            // Load tile from the weights tensor
+            // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+            // Compute the matrix multiplication between two tiles
+            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+            ++ck;
+        }
+    )_";
+    }
+
+    code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+    }
+)_";
+
+    if(_bias && _bias->has_valid_id())
+    {
+        code += R"_(
+        TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+        // c = c + bias[broadcasted]
+        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+    )_";
+    }
+
+    code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+    return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    vtable.declare_variable(
+        _src,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_src),
+        "src");
+
+    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+    vtable.declare_variable(
+        _weight,
+        GpuKernelArgumentInfo(weight_type),
+        comp_group.is_intermediate_tensor(_weight),
+        "weight");
+
+    if(_bias && _bias->has_valid_id()) // optional bias
+    {
+        vtable.declare_variable(
+            _bias,
+            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+            comp_group.is_intermediate_tensor(_bias),
+            "bias");
+    }
+    vtable.declare_variable(
+        _dst,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_dst),
+        "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    TagLUT lut{};
+    // Arguments and global shared variables
+    lut["src"]    = vtable.get_variable(_src);
+    lut["weight"] = vtable.get_variable(_weight);
+
+    if(_bias && _bias->has_valid_id()) // optional bias
+    {
+        lut["bias"]          = vtable.get_variable(_bias);
+        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+    }
+    lut["dst"] = vtable.get_variable(_dst);
+
+    const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+    lut["arg_dst"]          = dst_argument.uniq_name;
+
+    // Local build options
+    lut["meta_kernel_id"] = id();
+    lut["ACC_DATA_TYPE"]  = _src->data_type();
+    lut["SRC_DATA_TYPE"]  = _src->data_type();
+    lut["WEI_DATA_TYPE"]  = _weight->data_type();
+
+    lut["SRC_TENSOR_TYPE"] = "BUFFER";
+    switch(vtable.get_variable(_weight).kernel_argument_info.type)
+    {
+        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+    {
+        lut["WEI_TENSOR_TYPE"] = "IMAGE";
+        break;
+    }
+        default:
+    {
+        lut["WEI_TENSOR_TYPE"] = "BUFFER";
+        break;
+    }
+    }
+    const auto width_idx  = 1;
+    const auto height_idx = 2;
+    lut["WEI_WIDTH"]      = _weight->dimension(width_idx);
+    lut["WEI_HEIGHT"]     = _weight->dimension(height_idx);
+
+    lut["STRIDE_X"] = _attributes.stride().x();
+    lut["STRIDE_Y"] = _attributes.stride().y();
+
+    lut["PAD_LEFT"] = _attributes.pad().left;
+    lut["PAD_TOP"]  = _attributes.pad().top;
+
+    lut["ZERO_VALUE"] = 0;
+
+    return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const DataType     data_type   = _src->data_type();
+
+    /// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future
+    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
+    const unsigned int n0               = root_window.x().step();
+    const unsigned int m0               = root_window.y().step();
+    const unsigned int k0               = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
+    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+    CLBuildOptions build_opts{};
+    if(_settings.fast_relaxed_math())
+    {
+        build_opts.add_option("-cl-fast-relaxed-math");
+    }
+    else
+    {
+        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+        build_opts.add_option("-cl-unsafe-math-optimizations");
+    }
+    build_opts.add_option("-DIS_TILED");
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    return build_opts;
+}
+
+std::string ClTemplateDirectConv2d::get_config_id() const
+{
+    const DataType   data_type   = _src->data_type();
+    const DataLayout data_layout = _src->data_layout();
+
+    const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    const unsigned int kernel_size = _weight->dimension(width_idx);
+
+    std::string config_id{};
+    config_id += lower_string(string_from_data_type(data_type));
+    config_id += "_";
+    config_id += support::cpp11::to_string(kernel_size);
+    config_id += "_";
+    config_id += support::cpp11::to_string(_attributes.stride().x());
+    config_id += "_";
+    config_id += support::cpp11::to_string(_attributes.stride().y());
+    config_id += "_";
+    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+    config_id += "_";
+    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+    config_id += "_";
+    config_id += lower_string(string_from_data_layout(data_layout));
+    return config_id;
+}
+
+std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
+{
+    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+}
+
+Window ClTemplateDirectConv2d::get_window() const
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+    const auto output_shape = _dst->tensor_shape();
+
+    const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
+    const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+
+    // Create and configure kernel window
+    Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+    return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
new file mode 100644
index 0000000..48027a9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Template-based code writer for the OpenCL direct conv2d kernel component */
+class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
+{
+public:
+    using Attributes = ClComponentDirectConv2d::Attributes;
+    using Settings   = ClComponentDirectConv2d::Settings;
+    /** Constructor
+     *
+     * Similar to @ref ClComponentDirectConv2d::validate()
+     *
+     * @param[in] id         Component id
+     * @param[in] tensors    Tensor arguments to the components
+     * @param[in] attributes Component attributes
+     * @param[in] settings   Component settings
+     */
+    ClTemplateDirectConv2d(ComponentId                      id,
+                           const ArgumentPack<ITensorInfo> &tensors,
+                           const Attributes                &attributes,
+                           const Settings                  &settings);
+    /** Prevent instances of this class from being copy constructed */
+    ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+    /** Prevent instances of this class from being copied */
+    ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+    /** Allow instances of this class to be move constructed */
+    ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
+    /** Allow instances of this class to be moved */
+    ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
+    /** Generate kernel component name */
+    std::string get_name() const override;
+    /** Generate kernel component code template
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return std::string Component code
+     */
+    std::string get_component_code(const ComponentGroup &comp_group) const override;
+    /** Declare all variables used by the component in the @p vtable
+     *
+     * @param[out] vtable     Variable table
+     * @param[in]  comp_group Component group of which the component is a part of
+     */
+    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+    /** Generate the tag look-up table used to instantiate the component code.
+     *
+     * @param[in] vtable     Variable table
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return TagLUT  Tag lookup table
+     */
+    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+    /** Generate the build options used in the component
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return CLBuildOptions Build options
+     */
+    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+    /** Generate the component config id string used for tuning */
+    std::string get_config_id() const override;
+    /** Generate the header list used in the component */
+    std::set<std::string> get_headers_list() const override;
+    /** Generate the execution window for the component */
+    Window get_window() const override;
+
+private:
+    const ITensorInfo *_src;        // Input tensor info
+    const ITensorInfo *_weight;     // Weights tensor info
+    const ITensorInfo *_bias;       // Bias tensor info; presumably optional -- verify against ClComponentDirectConv2d
+    const ITensorInfo *_dst;        // Output tensor info
+    Attributes         _attributes; // Component attributes (includes stride, see get_config_id())
+    Settings           _settings;   // Component settings
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
new file mode 100644
index 0000000..6c4b8f5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuTemplateComponentWriter{ id, tensors },
+      _src{ this->tensors().get_const_tensor(TensorType::ACL_SRC_0) },
+      _dst{ this->tensors().get_const_tensor(TensorType::ACL_DST_0) }
+{
+}
+
+std::string ClTemplateStore::get_name() const
+{
+    // Component name used when assembling the fused kernel name
+    return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+    // The template computes per-row indirect destination Y indices (NHWC layout, with
+    // height and width collapsed into g_ind_1) and performs a width-select store that
+    // handles leftover columns when the width is not a multiple of N0.
+    return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0) maps to Channel
+// g_ind_1 maps to global_id(1) maps to Height and Weight (Collapsed Window)
+// g_ind_2 maps to global_id(2) maps to N / Batch
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+    TILE(uint, M0, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+        dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+    })
+
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+    T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+    //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    // Both tensors are declared as 4D buffer tensors; tensors that are intermediate
+    // within the component group are flagged so the variable table can treat them
+    // differently from kernel arguments.
+    vtable.declare_variable(
+        _src,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_src),
+        "src");
+    vtable.declare_variable(
+        _dst,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_dst),
+        "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    TagLUT lut{};
+
+    // Arguments and global shared variables
+    lut["src"] = vtable.get_variable(_src);
+    lut["dst"] = vtable.get_variable(_dst);
+
+    // Local build options
+    lut["meta_kernel_id"]  = id();
+    lut["DST_TENSOR_TYPE"] = "BUFFER";
+    // The destination data type is taken from the component group's (single) dst tensor
+    const auto dst_info    = comp_group.get_dst_tensors()[0];
+    lut["DST_DATA_TYPE"]   = dst_info->data_type();
+
+    return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000..3f97a82
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Template-based code writer for the OpenCL store kernel component */
+class ClTemplateStore final : public IGpuTemplateComponentWriter
+{
+public:
+    /** Constructor
+     *
+     * @param[in] id      Component id
+     * @param[in] tensors Tensor arguments to the components
+     */
+    ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+    /** Prevent instances of this class from being copy constructed */
+    ClTemplateStore(const ClTemplateStore &store) = delete;
+    /** Prevent instances of this class from being copied */
+    ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
+    /** Allow instances of this class to be move constructed */
+    ClTemplateStore(ClTemplateStore &&store) = default;
+    /** Allow instances of this class to be moved */
+    ClTemplateStore &operator=(ClTemplateStore &&store) = default;
+    /** Generate kernel component name */
+    std::string get_name() const override;
+    /** Generate kernel component code template
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return std::string Component code
+     */
+    std::string get_component_code(const ComponentGroup &comp_group) const override;
+    /** Declare all variables used by the component in the @p vtable
+     *
+     * @param[out] vtable     Variable table
+     * @param[in]  comp_group Component group of which the component is a part of
+     */
+    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+    /** Generate the tag look-up table used to instantiate the component code.
+     *
+     * @param[in] vtable     Variable table
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return TagLUT  Tag lookup table
+     */
+    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+private:
+    const ITensorInfo *_src; // Tensor to be stored
+    const ITensorInfo *_dst; // Destination tensor
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
new file mode 100644
index 0000000..cb643a7
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note: some tags can be unused since they could be used only for the macros, or only for the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+    std::string  replaced_code    = "";
+    bool         scanning_pattern = false;
+    std::string  pattern_found    = "";
+    // Iterate up to size() (not size() - 1): the old bound dropped the final character of
+    // the template and underflowed (size_t) for an empty template, causing out-of-bounds reads.
+    const size_t size = code_template.size();
+    for(size_t i = 0; i < size; ++i)
+    {
+        if(!scanning_pattern)
+        {
+            // A tag pattern opens with "{{"; the look-ahead is guarded against the end of the string
+            if(i + 1 < size && code_template[i] == '{' && code_template[i + 1] == '{')
+            {
+                i += 1;
+                scanning_pattern = true;
+                pattern_found    = "";
+            }
+            else
+            {
+                replaced_code += code_template[i];
+            }
+        }
+        else
+        {
+            // A tag pattern closes with "}}"; substitute the tag's value from the lookup table
+            if(i + 1 < size && code_template[i] == '}' && code_template[i + 1] == '}')
+            {
+                i += 1;
+                scanning_pattern = false;
+                std::string err  = "Pattern " + pattern_found + " not found in tags";
+                ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+                replaced_code += tags.find(pattern_found)->second.value;
+            }
+            else
+            {
+                pattern_found += code_template[i];
+            }
+        }
+    }
+    // A pattern still open at the end of the template is malformed
+    ARM_COMPUTE_ERROR_ON_MSG(scanning_pattern, "Unterminated tag pattern in code template");
+
+    return replaced_code;
+}
+// Out-of-line defaulted destructor (declared in the header)
+ClTemplateWriter::~ClTemplateWriter() = default;
+// Store a copy of the component group from which the kernel will be generated
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+    : _components{ components }
+{
+}
+// Kernel name: concatenation of the component names (see write_kernel_name())
+std::string ClTemplateWriter::get_name()
+{
+    return write_kernel_name();
+}
+// Kernel source: stitched component code templates with tags instantiated (see write_code())
+std::string ClTemplateWriter::get_code()
+{
+    return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+    // The tuning config id is the kernel name followed by every component's own config id
+    std::string config_id = get_name();
+    for(const auto &comp : _components)
+    {
+        config_id += "--" + comp->template_writer()->get_config_id() + "--";
+    }
+
+    return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+    // Union of the build options contributed by every component in the group
+    CLBuildOptions build_opts{};
+
+    for(const auto &comp : _components)
+    {
+        build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+    }
+
+    return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+    // The execution window of the fused kernel is defined by the group's root component
+    const auto root_comp = _components.get_root_component();
+    ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+    return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+    // Assemble GpuKernelArguments
+    // Only argument tensors of the group are returned; intermediate tensors are not
+    // kernel arguments. Each argument pairs the tensor info with the kernel argument
+    // info recorded in the variable table during declare_variables().
+    std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+    for(const auto t : _components.get_argument_tensors())
+    {
+        tensors.emplace(
+            t->id(),
+            GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+    }
+    return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+    // These data structures will hold the data from all the components in the blueprint
+    std::set<std::string>    headers_list{};
+    std::set<std::string>    additional_macros{};
+    std::vector<std::string> component_codes{}; // vector because order matters
+
+    // Pass 1: Declare all kernel variables
+    for(auto &component : _components)
+    {
+        component->template_writer()->declare_variables(_vtable, _components);
+    }
+    // Pass 2: Generate component codes
+    for(auto &component : _components)
+    {
+        const auto component_writer       = component->template_writer();
+        auto       curr_headers_list      = component_writer->get_headers_list();
+        auto       curr_additional_macros = component_writer->get_additional_macros();
+        auto       curr_component_code    = component_writer->get_component_code(_components);
+        const auto var_lut                = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+        component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+        headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+        // Note: check the *current* component's macros; checking the (initially empty)
+        // accumulated set here would silently drop every component's macros.
+        if(!curr_additional_macros.empty()) // Some components might not have any
+        {
+            additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+        }
+    }
+
+    // Step 3: Assemble the data gathered by traversing the graph into the string "code"
+    std::string code = "";
+
+    // Emit the headers first: embedded kernel sources when available, #include directives otherwise
+    for(auto &header : headers_list)
+    {
+#if defined(EMBEDDED_KERNELS)
+        code += CLKernelLibrary::get().get_program(header).first;
+#else  // defined(EMBEDDED_KERNELS)
+        code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+    }
+
+    for(auto &macros : additional_macros)
+    {
+        code += macros;
+    }
+
+    // Kernel signature over the group's argument tensors, then the shared global
+    // index section, then each component's instantiated code in order
+    code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+    code += "\n{\n\n";
+
+    code += "    //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+    code += write_global_section();
+    code += "    //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+    for(const auto &component_code : component_codes)
+    {
+        code += component_code;
+    }
+
+    code += "}\n";
+
+    return code;
+}
+std::string ClTemplateWriter::write_global_section() const
+{
+    const auto dst_tensors = _components.get_dst_tensors();
+    ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+    const auto dst_info   = dst_tensors[0];
+    const auto dst_w      = dst_info->dimension(0);
+    // Tile sizes come from the execution window steps (clamped to at least 1)
+    const auto tile_w     = std::max(1, get_window().x().step());
+    const auto tile_h     = std::max(1, get_window().y().step());
+    // Leftover columns along dimension 0 when the width is not a multiple of the tile width
+    auto       leftover_w = dst_w % tile_w;
+
+    // g_ind_0/1/2 are the global spatial indices shared by all component code sections
+    std::string code = "";
+    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+    code += std::string("    int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+    code += std::string("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+    code += "    const bool g_cond_x = (g_ind_0 == 0);\n";
+    code += "    const bool g_cond_y = (g_ind_1 == 0);\n";
+
+    return code;
+}
+// Map a kernel argument type to its OpenCL parameter declaration snippet
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+    std::string code;
+    switch(var.kernel_argument_info.type)
+    {
+        case GpuKernelArgumentInfo::Type::Vector:
+        {
+            code += "\n    VECTOR_DECLARATION(" + var.uniq_name + ")";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image:
+        {
+            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + ")";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image_3D:
+        {
+            // 3D image: a 2D image declaration plus an explicit Z stride parameter
+            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + "),";
+            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+        {
+            code += "\n    __read_only image2d_t " + var.uniq_name + "_img,";
+            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+        {
+            code += "\n    TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+        {
+            code += "\n    TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+        }
+    }
+    return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+    // Emit "__kernel void <name>(" followed by comma-separated argument declarations
+    std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+    for(size_t arg_idx = 0; arg_idx < argument_list.size(); ++arg_idx)
+    {
+        if(arg_idx > 0)
+        {
+            code += ",";
+        }
+        code += write_argument_declaration(argument_list[arg_idx]);
+    }
+
+    code += ')';
+
+    return code;
+}
+std::string ClTemplateWriter::write_kernel_name() const
+{
+    // The fused kernel name is the concatenation of all component names, separated by "___"
+    if(_components.empty())
+    {
+        return "empty_kernel";
+    }
+    // Note: the early return above guarantees at least one component, so no emptiness
+    // check is needed here (the previous ternary's empty branch was unreachable)
+    std::string name = _components[0]->template_writer()->get_name();
+    for(size_t i = 1; i < _components.size(); ++i)
+    {
+        name += "___";
+        name += _components[i]->template_writer()->get_name();
+    }
+
+    return name;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000..83f617b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use a templated-string-based method to write kernel code
+ *  It stitches the component code templates together based on the valid fusion configuration.
+ *  It then instantiates the actual kernel code from the template and the generated tag lookup table.
+ */
+class ClTemplateWriter : public IGpuKernelWriter
+{
+public:
+    /** Instantiates a kernel code string from the kernel code template
+     * @note: some tags can be unused since they could be used only for the macros, or only for the component code
+     *
+     * @param[in] code_template Kernel code template
+     * @param[in] tags          Tag lookup table
+     *
+     * @return std::string  Instantiated kernel string
+     */
+    static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
+    /** Default constructor */
+    ClTemplateWriter() = default;
+    /** Constructor
+     *
+     * @param[in] components Kernel component group from which the kernel will be generated
+     */
+    ClTemplateWriter(const GpuKernelComponentGroup &components);
+    /** Destructor */
+    ~ClTemplateWriter() override;
+    /** Generate kernel name */
+    std::string get_name() override;
+    /** Generate kernel code */
+    std::string get_code() override;
+    /** Generate build options */
+    CLBuildOptions get_build_options() override;
+    /** Generate config id string of the entire kernel. This is used for tuning */
+    std::string get_config_id() override;
+    /** Generate execution window */
+    Window get_window() const override;
+    /** Get the kernel argument lists of the kernel*/
+    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+
+private:
+    std::string write_kernel_name() const;                                                                // Concatenate component names into the kernel name
+    std::string write_code();                                                                             // Stitch component code templates into the full kernel source
+    std::string write_global_section() const;                                                             // Emit the shared global index (g_ind_*) section
+    std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;      // Emit one kernel parameter declaration
+    std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;  // Emit the full kernel signature
+
+private:
+    GpuKernelComponentGroup _components{}; // Fused component group the kernel is generated from
+    GpuKernelVariableTable  _vtable{};     // Table of kernel variables declared by the components
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */