Add GpuKernelArgumentBinding for runtime argument setting

* Add flexible runtime argument setting that accepts argument bindings
exported from ckw.

* Introduce internal build flag ACL_INTERNAL_TEST_CKW_IN_DF. If set to
true, ckw will be tested in dynamic fusion validation tests. Otherwise
it will not be tested and the dynamic fusion will keep using
ClTemplateWriter instead.

* Fix CKW sampler for elementwise binary to deal with tile sizes > 1
in both dimensions

Resolves: COMPMID-6282
Partially resolves: COMPMID-6260

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I0ab225a4484eb2119643d900a4e72806558626ee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9917
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 302d4c8..226e1a2 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -33,6 +33,7 @@
 namespace dynamic_fusion
 {
 /** Contain information required to set up a kernel argument at run time
+ * @deprecated To be removed along with ClTemplateWriter
  */
 struct GpuKernelArgumentInfo
 {
@@ -66,10 +67,9 @@
     }
     Type type{ Type::Tensor_4D_t_Buffer };
 };
-
 bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
-
 /** Kernel argument information linked with its corresponding @ref ITensorInfo
+ * @deprecated To be removed along with ClTemplateWriter
  */
 class GpuKernelArgument
 {
@@ -124,6 +124,130 @@
     TensorInfo            _tensor_info{};
     GpuKernelArgumentInfo _kernel_arg_info{};
 };
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+/** Describe how the tensor runtime memory can be accessed
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorStorageType
+{
+    Unknown,
+    ClBufferUint8Ptr,
+    ClImage2dReadOnly,
+    ClImage2dWriteOnly,
+};
+
+/** Describe additional runtime information about the tensor
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorComponentType
+{
+    Unknown,
+    OffsetFirstElement,
+    Stride0,
+    Stride1,
+    Stride2,
+    Stride3,
+    Stride4,
+    Dim0,
+    Dim1,
+    Dim2,
+    Dim3,
+    Dim4,
+    Dim1xDim2,
+    Dim2xDim3,
+    Dim1xDim2xDim3,
+};
+
+/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
+ *
+ * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
+ * void kernel(arg0, arg1, ... argN)
+ *
+ * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
+ * A tensor is described as: **storages** followed by **components**
+ *
+ * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
+ * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
+ *
+ * The arguments are arranged in the order of use in the generated kernel code:
+ *
+ *  arg0   , arg1      , arg2      ,                         ...,                         , argN
+ *  storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
+ * |                   tensor0                       |                    tensor1                    |
+ *
+ * An example argument list:
+ *
+ * void kernel(
+ *  image2d_t       t0_image,               // TensorStorageType::ClImage2dReadOnly
+ *  uint8_t*        t0_ptr,                 // TensorStorageType::ClBufferUint8Ptr
+ *  uint            t0_dim0,                // TensorComponentType::Dim0
+ *  uint            t0_stride1,             // TensorComponentType::Stride1
+ *  image2d_t       t1_image,               // TensorStorageType::ClImage2dReadOnly
+ *  uint            t1_dim1xdim2,           // TensorComponentType::Dim1xDim2
+ *  uint            t1_stride1,             // TensorComponentType::Stride1
+ *  uint            t1_stride2,             // TensorComponentType::Stride2
+ * )
+ *
+ */
+class GpuKernelArgumentBinding
+{
+public:
+    enum class Type : int32_t
+    {
+        TensorStorage,  /**< @ref TensorStorageType */
+        TensorComponent /**< @ref TensorComponentType */
+    };
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
+        : _type{ Type::TensorStorage }, _id{ id }, _value{}
+    {
+        _value.tensor_storage_type = storage;
+    }
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
+        : _type{ Type::TensorComponent }, _id{ id }, _value{}
+    {
+        _value.tensor_component_type = component;
+    }
+    /** Storage type of the tensor
+     */
+    TensorStorageType tensor_storage_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
+        return _value.tensor_storage_type;
+    }
+    /** Component of the tensor
+     */
+    TensorComponentType tensor_component_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
+        return _value.tensor_component_type;
+    }
+    /** Id of the tensor this kernel argument belongs to
+     */
+    ITensorInfo::Id id() const
+    {
+        return _id;
+    }
+    /** Type of the kernel argument
+     */
+    Type type() const
+    {
+        return _type;
+    }
+
+private:
+    Type            _type;
+    ITensorInfo::Id _id;
+    union Value
+    {
+        TensorStorageType   tensor_storage_type;
+        TensorComponentType tensor_component_type;
+    };
+    Value _value;
+};
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index b70a192..5a65ede 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -44,14 +44,14 @@
     return tensor_ids;
 }
 
-GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
-    : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
+    : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{}
 {
 }
 
 GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
 {
-    GpuKernelComponentStream stream{ _services, mem_map };
+    GpuKernelComponentStream stream{ _context, _services, mem_map };
     const auto               op_seq = _dependency_graph.build_operators_sequence();
 
     stream.new_component_group();
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
index 8314ea0..85c9b45 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
@@ -49,9 +49,10 @@
 public:
     /** Constructor
      *
+     * @param[in] context  @ref GpuWorkloadContext to be used by the graph
      * @param[in] services @ref GpuComponentServices to be used by the graph
      */
-    GpuKernelComponentGraph(GpuComponentServices *services);
+    GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services);
     /** Prevent instances of this class from being copy constructed */
     GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
     /** Prevent instances of this class from being copied */
@@ -98,6 +99,7 @@
 
 private:
     static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+    GpuWorkloadContext   *_context;
     GpuComponentServices *_services;
     std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
     std::map<ITensorInfo::Id, const ITensorInfo *>              _tensors;
@@ -106,4 +108,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index 8f4eadc..a2b6623 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 {
 namespace dynamic_fusion
 {
-GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
-    : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+    : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map }
 {
 }
 
@@ -51,7 +51,7 @@
         const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
         // The whole unit workload stage is determined by the root component
         const auto unit_workload_stage = group.get_root_component()->properties().stage();
-        source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+        source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map, _context);
     }
     return source_code;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
index cbaa7c2..ba2503a 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,10 +49,11 @@
 public:
     /** Constructor
      *
+     * @param[in] context  @ref GpuWorkloadContext to be used throughout the stream
      * @param[in] services @ref GpuComponentServices to be used throughout the stream
      * @param[in] mem_map  @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
      */
-    GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+    GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
     /** Allow instances of this class to be copy constructed */
     GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
     /** Allow instances of this class to be copied */
@@ -78,6 +79,7 @@
     bool add_component(IGpuKernelComponent *component);
 
 private:
+    GpuWorkloadContext                  *_context;
     GpuComponentServices                *_services;
     std::vector<GpuKernelComponentGroup> _component_groups{};
     MemoryDescriptorMap                  _mem_map{};
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 7479328..64e1cdc 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include <map>
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include <deque>
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include <string>
 
 namespace arm_compute
@@ -38,7 +42,11 @@
 namespace dynamic_fusion
 {
 /** The argument list of a @ref GpuKernelSourceCode */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 /** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
  */
@@ -123,4 +131,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index 00f625d..c99984f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,11 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -46,11 +50,19 @@
 GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
 {
     GpuKernelSourceCode code;
-    ClTemplateWriter    writer{ _comp_group };
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+    ClTemplateWriter writer { _comp_group };
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuCkwDriver writer { _comp_group };
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
     code.name(writer.get_name());
     code.code(writer.get_code());
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     code.arguments(writer.get_tensors());
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    code.arguments(writer.get_kernel_arguments());
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     code.build_options(writer.get_build_options());
     code.config_id(writer.get_config_id());
     code.window(writer.get_window());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
index 50f34d9..c2bd012 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -32,9 +32,8 @@
 {
 namespace dynamic_fusion
 {
-
 GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
-    : _impl { std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
+    : _impl{ std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
 {
 }
 
@@ -75,8 +74,7 @@
 }
 
 GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
-    : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx),
-      _next_tensor_id(1), _mem_map()
+    : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info()
 {
 }
 
@@ -103,26 +101,39 @@
 
     tensor_info.set_id(tensor_id);
     _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User };
+    // Save a *copy* of the user tensor info in workload context for future reference
+    // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context
+    _managed_tensor_info.emplace(tensor_info.id(), std::make_unique<TensorInfo>(tensor_info));
 }
 
-void GpuWorkloadContext::Impl::register_aux_tensor(ITensorInfo &tensor_info, const AuxMemoryInfo &mem_info)
+ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
 {
-    ARM_COMPUTE_ERROR_ON(tensor_info.has_valid_id());
-
-    const auto tensor_id = next_tensor_id();
-
-    tensor_info.set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, mem_info };
-}
-
-void GpuWorkloadContext::Impl::register_virtual_tensor(ITensorInfo &tensor_info)
-{
-    ARM_COMPUTE_ERROR_ON(tensor_info.has_valid_id());
-
-    const auto tensor_id = -next_tensor_id();
-
-    tensor_info.set_id(tensor_id);
+    auto       tensor_info = std::make_unique<TensorInfo>();
+    const auto tensor_id   = -next_tensor_id();
+    tensor_info->set_id(tensor_id);
     _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual };
+    auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+    return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo &itensor_info)
+{
+    auto       tensor_info = std::make_unique<TensorInfo>(itensor_info);
+    const auto tensor_id   = next_tensor_id();
+    tensor_info->set_id(tensor_id);
+    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } };
+    auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+    return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id)
+{
+    return _managed_tensor_info.at(id).get();
+}
+
+const ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id) const
+{
+    return _managed_tensor_info.at(id).get();
 }
 
 ITensorInfo::Id GpuWorkloadContext::Impl::next_tensor_id()
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
index a857932..c169476 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -36,7 +36,6 @@
 {
 namespace dynamic_fusion
 {
-
 /** Internal implementation of workload context. */
 class GpuWorkloadContext::Impl
 {
@@ -52,7 +51,7 @@
     Impl(Impl &) = default;
 
     /** Assignment */
-    Impl& operator=(Impl &) = default;
+    Impl &operator=(Impl &) = default;
 
     /** Get target GPU language. */
     GpuLanguage gpu_language() const;
@@ -69,27 +68,34 @@
      */
     void register_user_tensor(ITensorInfo &tensor_info);
 
-    /** Set a new ID and register the auxiliary tensor info.
+    /** Create a virtual (see @ref MemoryType) tensor info and save it
      *
-     * @param[in, out] tensor_info The tensor info to be registered.
-     * @param[in]      mem_info    The auxiliary tensor memory info.
+     * @return ITensorInfo*  The created virtual tensor info object pointer
      */
-    void register_aux_tensor(ITensorInfo &tensor_info, const AuxMemoryInfo &mem_info);
+    ITensorInfo *create_virtual_tensor();
+    /** Create an auxiliary (see @ref MemoryType) tensor info and save it
+     *
+     * @param[in] tensor_info @ref ITensorInfo to copy from
+     *
+     * @return ITensorInfo*  The created auxiliary tensor info object pointer
+     */
+    ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info);
 
-    /** Set a new ID and register the virtual tensor info.
-     *
-     * @param[in, out] tensor_info The tensor info to be registered.
-     */
-    void register_virtual_tensor(ITensorInfo &tensor_info);
+    /** Get tensor info created by this context, from id */
+    ITensorInfo *get_tensor_info(ITensorInfo::Id id);
+
+    /** Get tensor info created by this context, from id */
+    const ITensorInfo *get_tensor_info(ITensorInfo::Id id) const;
 
 private:
     ITensorInfo::Id next_tensor_id();
 
-    GpuLanguage _gpu_language;
+    GpuLanguage       _gpu_language;
     CLCompileContext *_cl_compile_ctx;
 
-    ITensorInfo::Id _next_tensor_id;
+    ITensorInfo::Id     _next_tensor_id;
     MemoryDescriptorMap _mem_map;
+    std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
index 44c99e8..d303389 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -26,13 +26,10 @@
 
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
-
-#include <memory>
-#include <vector>
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
 namespace arm_compute
 {
@@ -52,9 +49,8 @@
         Context *context)
         : _context{ context },
           _comp_services{},
-          _component_graph{ &_comp_services },
-          _operator_group{},
-          _managed_tensor_info_list{ std::vector<std::unique_ptr<TensorInfo>>() }
+          _component_graph{ _context, &_comp_services },
+          _operator_group{}
     {
     }
     /** Prevent instances of this class from being copy constructed */
@@ -90,10 +86,6 @@
     {
         return _operator_group;
     }
-    ITensorInfo::Id allocate_new_tensor_id()
-    {
-        return ++_next_id;
-    }
     /** Generate @ref GpuWorkloadSourceCode from the workload sketch
      * @note The sketch must be valid. Any error encountered during the building of the code will be thrown.
      *
@@ -110,37 +102,29 @@
      */
     ITensorInfo *create_virtual_tensor()
     {
-        auto uptr = std::make_unique<TensorInfo>();
-        _context->implementation().register_virtual_tensor(*uptr);
-        _managed_tensor_info_list.emplace_back(std::move(uptr));
-        return _managed_tensor_info_list.back().get();
+        return _context->implementation().create_virtual_tensor();
     }
     /** Create an auxiliary (see @ref MemoryType) tensor info and save it
      *
-     * @return ITensorInfo*  The created auxiliary tensor info object pointer
-     */
-
-    /** Create an auxiliary (see @ref MemoryType) tensor info and save it
-     *
      * @param[in] tensor_info @ref ITensorInfo to copy from
      *
      * @return ITensorInfo*  The created auxiliary tensor info object pointer
      */
     ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info)
     {
-        auto uptr = std::make_unique<TensorInfo>(tensor_info);
-        _context->implementation().register_aux_tensor(*uptr, AuxMemoryInfo{ uptr->total_size() });
-        _managed_tensor_info_list.emplace_back(std::move(uptr));
-        return _managed_tensor_info_list.back().get();
+        return _context->implementation().create_auxiliary_tensor(tensor_info);
+    }
+
+    ITensorInfo *get_tensor_info(ITensorInfo::Id id)
+    {
+        return _context->implementation().get_tensor_info(id);
     }
 
 private:
-    Context                                 *_context;
-    GpuComponentServices                     _comp_services;
-    GpuKernelComponentGraph                  _component_graph;
-    GpuOperatorGroup                         _operator_group;
-    ITensorInfo::Id                          _next_id{ ITensorInfo::invalid_tensor_id };
-    std::vector<std::unique_ptr<TensorInfo>> _managed_tensor_info_list;
+    Context                *_context;
+    GpuComponentServices    _comp_services;
+    GpuKernelComponentGraph _component_graph;
+    GpuOperatorGroup        _operator_group;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index d1d0bdf..578366d 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
 namespace arm_compute
 {
@@ -34,10 +35,45 @@
 {
 namespace dynamic_fusion
 {
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+namespace
+{
+/** Extract kernel arguments of one tensor from a flat list of kernel arguments.
+ *
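+ * @param[in, out] flat_kernel_args Flat list of kernel arguments; the leading run of arguments sharing the first tensor's id is popped from its front
+ * @return GpuKernelArgumentList  The popped kernel arguments of that tensor (empty if @p flat_kernel_args is empty)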
+ */
+GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
+{
+    if(flat_kernel_args.empty())
+    {
+        return {};
+    }
+    GpuKernelArgumentList tensor_kargs{};
+
+    const GpuKernelArgumentBinding &karg_head = flat_kernel_args.front();
+    tensor_kargs.push_back(karg_head);
+    flat_kernel_args.pop_front();
+    const auto tensor_id = karg_head.id();
+
+    while(!flat_kernel_args.empty())
+    {
+        const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
+        if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+        {
+            return tensor_kargs;
+        }
+        tensor_kargs.push_back(karg);
+        flat_kernel_args.pop_front();
+    }
+    return tensor_kargs;
+}
+}
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
 using UnitWorkloadId = int32_t;
 
-/** Describes all the info related to a kernel in order to:
+/** Describes all the info related to a **workload argument** (tensor) in order to:
  *  - be used by runtime to configure gpu kernel argument
  *  - be used by memory managers to allocate required memory
  */
@@ -46,6 +82,7 @@
 public:
     /** Default constructor */
     GpuWorkloadArgument() = default;
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Constructor
      *
      * @param[in] tensor_info     @ref ITensorInfo of the workload argument
@@ -60,6 +97,22 @@
           _kernel_arg_info{ kernel_arg_info }
     {
     }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Constructor
+     *
+     * @param[in] tensor_info     @ref ITensorInfo of the workload argument
+     * @param[in] mem_desc        @ref MemoryDescriptor of the workload argument
+     * @param[in] kernel_args     @ref GpuKernelArgumentList of the workload argument
+     */
+    GpuWorkloadArgument(const ITensorInfo           &tensor_info,
+                        const MemoryDescriptor      &mem_desc,
+                        const GpuKernelArgumentList &kernel_args)
+        : _tensor_info{ tensor_info },
+          _mem_desc{ mem_desc },
+          _kernel_args{ kernel_args }
+    {
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get tensor id within workload */
     ITensorInfo::Id id() const
     {
@@ -85,6 +138,7 @@
     {
         return &_mem_desc;
     }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get @ref GpuKernelArgumentInfo of the argument */
     GpuKernelArgumentInfo *kernel_argument_info()
     {
@@ -95,6 +149,18 @@
     {
         return &_kernel_arg_info;
     }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Get @ref GpuKernelArgumentList of the workload tensor */
+    GpuKernelArgumentList *kernel_argument_list()
+    {
+        return &_kernel_args;
+    }
+    /** Get @ref GpuKernelArgumentList of the workload tensor */
+    const GpuKernelArgumentList *kernel_argument_list() const
+    {
+        return &_kernel_args;
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Check if the workload argument has valid id
      *
      * @return true   If has valid id
@@ -106,9 +172,13 @@
     }
 
 private:
-    TensorInfo            _tensor_info{};
-    MemoryDescriptor      _mem_desc{};
-    GpuKernelArgumentInfo _kernel_arg_info{};
+    TensorInfo       _tensor_info{};
+    MemoryDescriptor _mem_desc{};
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuKernelArgumentInfo _kernel_arg_info {};
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuKernelArgumentList     _kernel_args {};
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 /** Describes when a unit workload is run.
@@ -179,15 +249,18 @@
      * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
      * @param[in] stage       Stage of the unit workload
      * @param[in] mem_map     @ref MemoryDescriptor map for all tensors within the unit workload
+     * @param[in] context     @ref GpuWorkloadContext associated with the unit workload
      *
      * @return UnitWorkloadId  Allocated unit workload id
      */
-    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context)
     {
         // Use the size of the kernel codes as Id
         const auto uwk_id    = static_cast<UnitWorkloadId>(_unit_workloads.size());
         const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
         _unit_workloads.push_back(unit_work);
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+        ARM_COMPUTE_UNUSED(context);
         // Assemble kernel argument with memory descriptor to form workload argument
         for(const auto &id_arg : kernel_code.arguments())
         {
@@ -200,6 +273,28 @@
             }
             _tensor_uwork_map[arg_id].insert(uwk_id);
         }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+        GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
+        GpuKernelArgumentList tensor_kargs{};
+        while(true)
+        {
+            tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
+            if(tensor_kargs.empty())
+            {
+                break;
+            }
+            else
+            {
+                const auto tensor_id           = tensor_kargs.at(0).id();
+                _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs };
+                if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+                {
+                    _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
+                }
+                _tensor_uwork_map[tensor_id].insert(uwk_id);
+            }
+        }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
         return uwk_id;
     }
     /** Get a unit workload from its id */
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index ae67790..28e5432 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,9 +27,11 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 
 #include <map>
 #include <string>
+#include <vector>
 
 namespace arm_compute
 {
@@ -56,8 +58,20 @@
     virtual std::string get_config_id() = 0;
     /** Generate execution window */
     virtual Window get_window() const = 0;
-    /** Get the kernel argument lists of the kernel*/
-    virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+    /** Get the kernel argument lists of the kernel. This default implementation returns an empty map
+     * @deprecated To be removed along with ClTemplateWriter
+     */
+    virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors()
+    {
+        return {};
+    }
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Get the flat list of arguments of the kernel. This default implementation returns an empty list */
+    virtual GpuKernelArgumentList get_kernel_arguments()
+    {
+        return {};
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index d5c03c6..d78956f 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Window.h"
 #include "src/common/utils/Log.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
@@ -42,29 +43,24 @@
 namespace dynamic_fusion
 {
 GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
-    : _components{ components }
+    : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }
 {
 }
 
 std::string GpuCkwDriver::get_name()
 {
     ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    return "todo_get_name";
+    return "unnamed";
 }
 
 std::string GpuCkwDriver::get_code()
 {
-    ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    ckw::Kernel              kernel(get_name().c_str(), GpuTargetLanguage::OpenCL);
-    GpuCkwKernelWriter       root_writer(kernel);
+    _kernel.name(get_name());
+    GpuCkwKernelWriter       root_writer(_kernel);
     GpuCkwScopedKernelWriter writer(&root_writer);
     GpuCkwVariableTable      vtable{};
 
     // Global Kernel Writer Driver code
-
-    // The following is just an incomplete example of using the kernel writer
-
-    // Iterate over component specific Ckw Driver; generate component code and concatenate them
     for(auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
@@ -96,18 +92,31 @@
     return root_comp->ckw_component_driver()->get_window();
 }
 
-std::map<ITensorInfo::Id, GpuKernelArgument> GpuCkwDriver::get_tensors()
+GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
 {
-    ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    // Assemble GpuKernelArguments
-    std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
-    for(const auto t : _components.get_argument_tensors())
+    GpuKernelArgumentList args{};
+    for(const auto &arg : _kernel.arguments())
     {
-        tensors.emplace(
-            t->id(),
-            GpuKernelArgument{ *t, { GpuKernelArgumentInfo::Type::Tensor_Special_0 } });
+        switch(arg.type())
+        {
+            case KernelArgument::Type::TensorStorage:
+            {
+                args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type()));
+                break;
+            }
+            case KernelArgument::Type::TensorComponent:
+            {
+                args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type()));
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported KernelArgument Type");
+                break;
+            }
+        }
     }
-    return tensors;
+    return args;
 }
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 2084b72..c6e03f6 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -28,6 +28,8 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
 
+#include "ckw/Kernel.h"
+
 #include <map>
 #include <string>
 
@@ -66,11 +68,12 @@
     std::string get_config_id() override;
     /** Generate execution window */
     Window get_window() const override;
-    /** Get the kernel argument lists of the kernel*/
-    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+    /** Get the flat list of arguments of the kernel*/
+    GpuKernelArgumentList get_kernel_arguments() override;
 
 private:
     GpuKernelComponentGroup _components{};
+    ckw::Kernel             _kernel;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 1549687..6f3eca7 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -23,9 +23,10 @@
  */
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 #include <sstream>
 
@@ -35,7 +36,8 @@
 {
 namespace dynamic_fusion
 {
-GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias)
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
+                                                               const std::string &alias)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
 
@@ -59,7 +61,7 @@
         std::stringstream ss;
         ss << alias << "_t" << abs(tensor->id());
         const auto              uniq_name = ss.str();
-        GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name.c_str(), to_ckw(*tensor)) };
+        GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) };
         auto                  &&inserted = _vars.emplace(tensor->id(), var);
         return &(inserted.first->second);
     }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 1c9cb08..0649dcb 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -24,8 +24,8 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 #include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 
 #include <map>
 
@@ -37,6 +37,7 @@
 {
 class GpuKernelComponentGroup;
 class GpuCkwScopedKernelWriter;
+enum class TensorStorageType;
 
 /** A table of all the variables used in the kernel.
  *
@@ -52,11 +53,13 @@
      * @param[in] comp_group Component group the tensor belongs to
      * @param[in] writer     Compute Kernel Writer
      * @param[in] tensor     Tensor info with which the new variable is associated
+     * @param[in] storage    Tensor storage type associated with the tensor
      * @param[in] alias      Alias for the variable. Will be used as part of the variable name
      *
      * @return GpuCkwComponentArgument*
      */
-    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias = "unnamed");
+    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
+                                              const std::string &alias = "unnamed");
 
 private:
     std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index 224c176..c07fac0 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -23,14 +23,15 @@
  */
 #include "GpuCkwActivation.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include <string>
@@ -84,8 +85,8 @@
 } // namespace
 
 GpuCkwActivation::GpuCkwActivation(ComponentId                      id,
-                                                 const ArgumentPack<ITensorInfo> &tensors,
-                                                 const Attributes                &attributes)
+                                   const ArgumentPack<ITensorInfo> &tensors,
+                                   const Attributes                &attributes)
     : IGpuCkwComponentDriver{ id, tensors },
       _src{},
       _dst{},
@@ -102,8 +103,8 @@
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
 
@@ -111,14 +112,14 @@
     auto &dst_tile = dst->tile();
 
     // Constants
-    const auto &constant_minus_1     = writer->declare_tile("minus_1", -1);
-    const auto &constant_pos_1       = writer->declare_tile("one", 1);
-    const auto &constant_zero        = writer->declare_tile("zero", 0);
-    const auto &constant_A           = writer->declare_tile("A_VAL", _attributes.a());
-    const auto &constant_B           = writer->declare_tile("B_VAL", _attributes.b());
+    const auto &constant_minus_1 = writer->declare_tile("minus_1", -1);
+    const auto &constant_pos_1   = writer->declare_tile("one", 1);
+    const auto &constant_zero    = writer->declare_tile("zero", 0);
+    const auto &constant_A       = writer->declare_tile("A_VAL", _attributes.a());
+    const auto &constant_B       = writer->declare_tile("B_VAL", _attributes.b());
 
     // Perform the operation.
-    switch (_attributes.activation())
+    switch(_attributes.activation())
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
         {
@@ -178,9 +179,9 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index dd71c55..8d7e6a8 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -23,14 +23,15 @@
  */
 #include "GpuCkwCast.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 #include <string>
@@ -84,8 +85,8 @@
 } // namespace
 
 GpuCkwCast::GpuCkwCast(ComponentId                      id,
-                                                 const ArgumentPack<ITensorInfo> &tensors,
-                                                 const Attributes                &attributes)
+                       const ArgumentPack<ITensorInfo> &tensors,
+                       const Attributes                &attributes)
     : IGpuCkwComponentDriver{ id, tensors },
       _src{},
       _dst{},
@@ -102,8 +103,8 @@
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Load the source tile and prepare the sampler.
     if(!src->has_tile())
@@ -124,7 +125,7 @@
     if(!dst->has_tile())
     {
         // Get Target datatype and convert it to ckw::DataType.
-        ckw::DataType target_dt =  dynamic_fusion::to_ckw(_attributes.data_type());
+        ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
 
         // Create dst_tile based on src_tile dimensions and with target DataType.
         const TileInfo src_tile_info = src_tile.tile_info();
@@ -166,9 +167,9 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index 685bf39..15e32e2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -23,14 +23,16 @@
  */
 #include "GpuCkwElementwiseBinary.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "ckw/types/TensorSamplerTypes.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include <string>
@@ -54,14 +56,20 @@
     auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
     auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
 
-    auto &const_0 = writer->declare_tile("0", 0);
-
     writer->op_get_global_id(gid_0, 0);
     writer->op_get_global_id(gid_1, 1);
     writer->op_get_global_id(gid_2, 2);
 
-    sampler.x(gid_0);
-    sampler.y(gid_1);
+    auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
+    auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
+    auto &m0_t    = writer->declare_tile("m0", m0);
+    auto &n0_t    = writer->declare_tile("n0", n0);
+    writer->op_binary_expression(x_coord, gid_0, ckw::BinaryOp::Mul, n0_t);
+    writer->op_binary_expression(y_coord, gid_1, ckw::BinaryOp::Mul, m0_t);
+
+    sampler.x(x_coord);
+    sampler.y(y_coord);
+    auto &const_0 = writer->declare_tile("0", 0);
     sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
     sampler.b(gid_2);
 
@@ -99,9 +107,9 @@
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
-    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Load the LHS and RHS tiles and prepare the tensor sampler.
     load_lhs_rhs_tiles_and_prepare_sampler(writer, lhs, rhs, m0, n0, create_simple_sampler);
@@ -131,10 +139,9 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    // constexpr unsigned int vector_size_byte_opencl = 16;
-    // const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    const unsigned int num_elems_processed_per_iteration = 1U; // Hard-coded for now
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 63555e6..247d1b8 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -24,6 +24,7 @@
 #include "GpuCkwStore.h"
 
 #include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
@@ -43,8 +44,8 @@
 }
 void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
 {
-    auto src = vtable.declare_variable(comp_group, writer, _src, "src");
-    auto dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     auto       &src_tile   = src->tile();
     const auto &sampler    = src->tile_sampler();
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
index 9027bdd..8a38d67 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "ckw/TensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
 namespace arm_compute
 {
@@ -98,6 +99,103 @@
         tensor_info.id()
     };
 }
+
+/** Convert a ckw tensor component type to the dynamic fusion equivalent
+ *
+ * Each handled ckw enumerator maps to the dynamic fusion enumerator of the
+ * same name, including Unknown.
+ *
+ * @param[in] component Tensor component type as defined by ckw
+ *
+ * @return The matching TensorComponentType. A value that is not handled
+ *         below triggers ARM_COMPUTE_ERROR; the return of
+ *         TensorComponentType::Unknown that follows it ensures every path
+ *         yields a value
+ */
+inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+    // Every case returns directly, so no break statements are required
+    switch(component)
+    {
+        case ckw::TensorComponentType::OffsetFirstElement:
+            return TensorComponentType::OffsetFirstElement;
+        case ckw::TensorComponentType::Stride0:
+            return TensorComponentType::Stride0;
+        case ckw::TensorComponentType::Stride1:
+            return TensorComponentType::Stride1;
+        case ckw::TensorComponentType::Stride2:
+            return TensorComponentType::Stride2;
+        case ckw::TensorComponentType::Stride3:
+            return TensorComponentType::Stride3;
+        case ckw::TensorComponentType::Stride4:
+            return TensorComponentType::Stride4;
+        case ckw::TensorComponentType::Dim0:
+            return TensorComponentType::Dim0;
+        case ckw::TensorComponentType::Dim1:
+            return TensorComponentType::Dim1;
+        case ckw::TensorComponentType::Dim2:
+            return TensorComponentType::Dim2;
+        case ckw::TensorComponentType::Dim3:
+            return TensorComponentType::Dim3;
+        case ckw::TensorComponentType::Dim4:
+            return TensorComponentType::Dim4;
+        case ckw::TensorComponentType::Dim1xDim2:
+            return TensorComponentType::Dim1xDim2;
+        case ckw::TensorComponentType::Dim2xDim3:
+            return TensorComponentType::Dim2xDim3;
+        case ckw::TensorComponentType::Dim1xDim2xDim3:
+            return TensorComponentType::Dim1xDim2xDim3;
+        case ckw::TensorComponentType::Unknown:
+            return TensorComponentType::Unknown;
+        default:
+            // Value outside the enumerators handled above
+            ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+            return TensorComponentType::Unknown;
+    }
+}
+
+/** Convert a dynamic fusion tensor storage type to the ckw equivalent
+ * @param[in] storage Tensor storage type as defined by dynamic fusion
+ * @return The matching ckw::TensorStorageType; unhandled values raise ARM_COMPUTE_ERROR
+ */
+inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+    switch(storage)
+    {
+        case TensorStorageType::ClBufferUint8Ptr:
+            return ckw::TensorStorageType::BufferUint8Ptr;
+        case TensorStorageType::ClImage2dReadOnly:
+            return ckw::TensorStorageType::Texture2dReadOnly;
+        case TensorStorageType::ClImage2dWriteOnly:
+            return ckw::TensorStorageType::Texture2dWriteOnly;
+        case TensorStorageType::Unknown:
+            return ckw::TensorStorageType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown tensor storage type");
+            return ckw::TensorStorageType::Unknown;
+    }
+}
+/** Convert a ckw tensor storage type to the dynamic fusion equivalent
+ * @param[in] storage Tensor storage type as defined by ckw
+ * @return The matching TensorStorageType; unhandled values raise ARM_COMPUTE_ERROR
+ */
+inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+    switch(storage)
+    {
+        case ckw::TensorStorageType::BufferUint8Ptr:
+            return TensorStorageType::ClBufferUint8Ptr;
+        case ckw::TensorStorageType::Texture2dReadOnly:
+            return TensorStorageType::ClImage2dReadOnly;
+        case ckw::TensorStorageType::Texture2dWriteOnly:
+            return TensorStorageType::ClImage2dWriteOnly;
+        case ckw::TensorStorageType::Unknown:
+            return TensorStorageType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+            return TensorStorageType::Unknown;
+    }
+}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index d600956..af766a7 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -104,9 +104,11 @@
     {
         return _properties;
     }
-    /** Get template writer for the component */
-    virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
-    /** Get compute kernel writer driver for the component */
+    /** Get template writer for the component. This default implementation returns nullptr */
+    virtual const IGpuTemplateComponentWriter *template_writer() const
+    {
+        return nullptr;
+    }
     virtual const IGpuCkwComponentDriver *ckw_component_driver() const
     {
         return nullptr;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index d2cde40..c41257d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -24,8 +24,11 @@
 #include "ClComponentActivation.h"
 
 #include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -66,8 +69,17 @@
                                              const ArgumentPack<ITensorInfo>       &tensors,
                                              const Attributes                      &attributes)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateActivation>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwActivation>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateActivation>(id, tensors, attributes)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwActivation>(id, tensors, attributes)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
@@ -75,15 +87,15 @@
 {
 }
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentActivation::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
 
-const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index bb6f7c6..ebe8719 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -42,8 +42,11 @@
 class ArgumentPack;
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateActivation;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwActivation;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentActivation final : public IGpuKernelComponent
 {
@@ -106,10 +109,12 @@
     /** Allow instances of this class to be moved */
     ClComponentActivation &operator=(ClComponentActivation &&component) = default;
 
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-
-    const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver     *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
     /** Get component type */
     GpuComponentType type() const override
@@ -118,8 +123,11 @@
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateActivation> _component_writer;
-    std::unique_ptr<GpuCkwActivation>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwActivation> _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index 92933ae..635869f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -26,8 +26,11 @@
 #include "arm_compute/core/Error.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -67,23 +70,32 @@
                                  const Attributes                &attributes,
                                  const Settings                  &settings)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateCast>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwCast>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateCast>(id, tensors, attributes)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwCast>(id, tensors, attributes)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     ARM_COMPUTE_UNUSED(attributes, settings);
 }
 ClComponentCast::~ClComponentCast()
 {
 }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentCast::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
 
-const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index 174f967..37b8cbb 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -48,8 +48,11 @@
 };
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateCast;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwCast;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentCast final : public IGpuKernelComponent
 {
@@ -116,10 +119,12 @@
     ClComponentCast(ClComponentCast &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentCast &operator=(ClComponentCast &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-    /** Get GPU kernel writer for the component */
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -127,8 +132,11 @@
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateCast> _component_writer;
-    std::unique_ptr<GpuCkwCast>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwCast>   _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 52739e2..88d7291 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -25,8 +25,11 @@
 
 #include "arm_compute/core/Validate.h"
 #include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -106,28 +109,38 @@
     return Status{};
 }
 
+ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
+{
+}
 ClComponentElementwiseBinary::ClComponentElementwiseBinary(
     ComponentId                      id,
     const Properties                &properties,
     const ArgumentPack<ITensorInfo> &tensors,
     const Attributes                &attributes)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
-ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
-{
-}
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
 
-const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index a56dd8b..f717590 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -40,8 +40,11 @@
 class ArgumentPack;
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateElementwiseBinary;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwElementwiseBinary;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentElementwiseBinary final : public IGpuKernelComponent
 {
@@ -101,10 +104,13 @@
     ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver            *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
-    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -112,10 +118,13 @@
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateElementwiseBinary> _component_writer;
-    std::unique_ptr<GpuCkwElementwiseBinary>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index a3283b1..12b81c3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -24,8 +24,11 @@
 #include "ClComponentStore.h"
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 #include <memory>
 
@@ -43,20 +46,31 @@
     return Status{};
 }
 ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }, _ckw_driver{ std::make_unique<GpuCkwStore>(id, tensors) }
+    : IGpuKernelComponent{ id, properties, tensors },
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateStore>(id, tensors)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwStore>(id, tensors)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 ClComponentStore::~ClComponentStore()
 {
 }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
-const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index f168ccb..853ee39 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -25,7 +25,6 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
 #include <memory>
 
 namespace arm_compute
@@ -39,7 +38,11 @@
 /** Forward declaration */
 template <typename T>
 class ArgumentPack;
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+class ClTemplateStore;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwStore;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentStore final : public IGpuKernelComponent
 {
@@ -85,10 +88,12 @@
     ClComponentStore(ClComponentStore &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentStore &operator=(ClComponentStore &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -96,8 +101,11 @@
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateStore> _component_writer;
-    std::unique_ptr<GpuCkwStore>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwStore>  _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental