Use the stable CKW API in the GPU dynamic fusion backend

- Refactor all kernels to work with the stable CKW API
- Add sub-tile support to the op_load/op_store CKW operators (see the
  sketch below)
- Fix a mismatch in resize
- Add comments to all kernels written with CKW to help developers
  understand the structure of the code
- Add texture image support to the depthwise convolution kernel written
  with CKW
- Add support for different block sizes in depthwise convolution
- Remove the use of the dynamic fusion helper functions
- Add support for floor in CKW's op_unary() (see the sketch alongside the
  UnaryOp::Floor change below)
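
For reference, a minimal sketch of a sub-tile load through the stable
API. All names, the tile shape, and the pre-configured sampler and
coordinate operands are illustrative assumptions, not part of this
patch:

```cpp
#include "ckw/KernelWriter.h"

using namespace ckw;

// "src", "sampler" and the coordinate tiles are assumed to have been
// declared and configured by the caller.
void load_top_half(KernelWriter &writer, TensorOperand &src, TensorSampler &sampler,
                   const TileOperand &x, const TileOperand &y,
                   const TileOperand &z, const TileOperand &batch)
{
    // A 4x4 fp32 destination tile.
    TileOperand dst = writer.declare_tile("dst", TileInfo(DataType::Fp32, 4, 4));

    // op_load()/op_store() now accept sub-tile operands: load rows 0-1 only.
    writer.op_load(dst.tile(0, 2, 0, 4), src, sampler, x, y, z, batch);
}
```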

Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>

Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/.clang-tidy b/.clang-tidy
index e789636..b0d22e5 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,9 +1,9 @@
 ---
-Checks:          'clang-diagnostic-*,clang-analyzer-*,*,-abseil-*,-fuchsia-*,-bugprone-*,-hicpp-*,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-constant-array-index,-cert-err58-cpp,-cppcoreguidelines-pro-type-reinterpret-cast,-google-runtime-references,-google-build-using-namespace,-readability-redundant-member-init,-readability-redundant-declaration,-readability-else-after-return,-performance-type-promotion-in-math-fn,-cert-err60-cpp,-cppcoreguidelines-narrowing-conversions,-readability-magic-numbers,-cppcoreguidelines-avoid-magic-numbers,-readability-named-parameter,-readability-implicit-bool-conversion,-readability-uppercase-literal-suffix,-clang-analyzer-optin.cplusplus.VirtualCall,-cppcoreguidelines-macro-usage'
+Checks:          'clang-diagnostic-*,*,-llvm-include-order,clang-analyzer-*,-abseil-*,-fuchsia-*,-bugprone-*,-hicpp-*,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-constant-array-index,-cert-err58-cpp,-cppcoreguidelines-pro-type-reinterpret-cast,-google-runtime-references,-google-build-using-namespace,-readability-redundant-member-init,-readability-redundant-declaration,-readability-else-after-return,-performance-type-promotion-in-math-fn,-cert-err60-cpp,-cppcoreguidelines-narrowing-conversions,-readability-magic-numbers,-cppcoreguidelines-avoid-magic-numbers,-readability-named-parameter,-readability-implicit-bool-conversion,-readability-uppercase-literal-suffix,-clang-analyzer-optin.cplusplus.VirtualCall,-cppcoreguidelines-macro-usage'
 WarningsAsErrors: ''
 HeaderFilterRegex: ''
 AnalyzeTemporaryDtors: false
-CheckOptions:    
+CheckOptions:
   - key:             cert-dcl16-c.IgnoreMacros
     value:           '1'
   - key:             cert-dcl16-c.NewSuffixes
@@ -211,4 +211,3 @@
   - key:             zircon-temporary-objects.Names
     value:           ''
 ...
-
diff --git a/SConscript b/SConscript
index 100bb54..f0c4297 100644
--- a/SConscript
+++ b/SConscript
@@ -137,7 +137,7 @@
 
 
 def get_ckw_obj_list():
-    cmake_obj_dir = os.path.abspath("prototype/CMakeFiles/ckw_prototype.dir/src")
+    cmake_obj_dir = os.path.abspath("CMakeFiles/ckw.dir/src")
     return recursive_glob(root_dir=cmake_obj_dir, pattern=".*.o$")
 
 
@@ -163,7 +163,7 @@
     else:
         # Always statically link Compute Library against CKW
         if env['experimental_dynamic_fusion'] and name == "arm_compute":
-            libs.append('libckw_prototype.a')
+            libs.append('libckw.a')
 
         # Add shared library versioning
         if env['set_soname']:
diff --git a/SConstruct b/SConstruct
index e415b34..bad85e5 100644
--- a/SConstruct
+++ b/SConstruct
@@ -169,7 +169,7 @@
 if not env['install_dir'].startswith('/') and install_path != "":
     install_path = "%s/%s" % (build_path, install_path)
 
-env.Append(LIBPATH = [build_path, os.path.join(build_path, "prototype")])
+env.Append(LIBPATH = [build_path, os.path.join(build_path, "")])
 Export('env')
 Export('vars')
 
@@ -439,15 +439,14 @@
     CKW_ENABLE_ASSERTS = env['debug'] or env['asserts']
 
     CKW_PROJECT_DIR = Dir('.').path + "/compute_kernel_writer"
-    CKW_INCLUDE_DIR = CKW_PROJECT_DIR + "/prototype/include"
+    CKW_INCLUDE_DIR = CKW_PROJECT_DIR + "/include"
     CKW_BUILD_DIR = build_path.replace("#", "")
 
     CKW_CMAKE_CMD = "CC={CKW_CC} CXX={CKW_CXX} cmake -G \"Unix Makefiles\" " \
                     "-S {CKW_PROJECT_DIR} -B {CKW_BUILD_DIR} " \
                     "-DCMAKE_BUILD_TYPE={CKW_BUILD_TYPE} " \
-                    "-DCKW_ENABLE_OPENCL={CKW_ENABLE_OPENCL} " \
+                    "-DCKW_ENABLE_OPENCL=ON " \
                     "-DCKW_ENABLE_ASSERTS={CKW_ENABLE_ASSERTS} " \
-                    "-DCKW_BUILD_PROTOTYPE=ON " \
                     "-DCKW_CCACHE={CKW_CCACHE} ".format(CKW_CC=CKW_CC,
                                                         CKW_CXX=CKW_CXX,
                                                         CKW_PROJECT_DIR=CKW_PROJECT_DIR,
@@ -460,7 +459,7 @@
 
     # Configure CKW static objects with -fPIC (CMAKE_POSITION_INDEPENDENT_CODE) option to enable linking statically to ACL
     CKW_CMAKE_CONFIGURE_STATIC = CKW_CMAKE_CMD + "-DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON"
-    CKW_CMAKE_BUILD = "cmake --build {CKW_BUILD_DIR} --target ckw_prototype -j{NUM_JOBS}".format(CKW_BUILD_DIR=CKW_BUILD_DIR,
+    CKW_CMAKE_BUILD = "cmake --build {CKW_BUILD_DIR} --target ckw -j{NUM_JOBS}".format(CKW_BUILD_DIR=CKW_BUILD_DIR,
                                                                                                  NUM_JOBS=GetOption('num_jobs')
                                                                                                  )
 
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
index 5b6c1b9..0030528 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD
-#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
 
 #include "arm_compute/core/Error.h"
 
@@ -52,9 +52,6 @@
      * |:--------------|:--------------|:-------------|
      * |F16            |F16            |F16           |
      * |F32            |F32            |F32           |
-     * |S32            |S32            |S32           |
-     * |S16            |S16            |S16           |
-     * |U8             |U8             |U8            |
      *
      * Valid data layouts:
      * - Any
@@ -86,4 +83,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD */
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
index 1593cec..9735dcf 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST
-#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 
@@ -49,13 +49,8 @@
      * Valid data type configurations:
      * |src            |dst                                    |
      * |:--------------|:--------------------------------------|
-     * |U8             | S8, U16, S16, U32, S32, F16, F32      |
-     * |U16            | U8, S8, S16, U32, S32, F16, F32       |
-     * |S16            | U8, S8, U16, U32, S32, F16, F32       |
-     * |U32            | U8, S8, U16, S16, S32, F16, F32       |
-     * |S32            | U8, S8, U16, S16, U32, F16, F32       |
-     * |F16            | U8, S8, U16, S16, U32, S32, F32       |
-     * |F32            | U8, S8, U16, S16, U32, S32, F16       |
+     * |F16            | F32                                   |
+     * |F32            | F16                                   |
      *
      * Input data type must be different than output data type.
      *
@@ -90,4 +85,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST */
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
index 4d2db0e..a1b0c33 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,12 +42,6 @@
 class GpuPool2dSettings
 {
 public:
-    /* Get mixed_precision*/
-    bool mixed_precision() const;
-
-    /* Set mixed_precision */
-    GpuPool2dSettings &mixed_precision(bool mixed_precision);
-
     /* Get using -infinity as limit flag */
     bool use_inf_as_limit() const;
 
@@ -55,7 +49,6 @@
     GpuPool2dSettings use_inf_as_limit(bool use_inf_as_limit);
 
 private:
-    bool _mixed_precision{false};
     bool _use_inf_as_limit{true};
 };
 
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
index e2ece80..e02f9da 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE
-#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
 
@@ -51,12 +51,8 @@
      * Valid data type configurations:
      * |src            |dst            |
      * |:--------------|:--------------|
-     * |QASYMM8        |QASYMM8        |
-     * |QASYMM8_SIGNED |QASYMM8_SIGNED |
      * |F16            |F16            |
      * |F32            |F32            |
-     * |U8             |U8             |
-     * |S16            |S16            |
      *
      * Valid data layouts:
      * - NHWC
@@ -90,4 +86,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE */
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
index 2d9255f..8c44ec5 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB
-#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
 
 #include "arm_compute/core/Error.h"
 
@@ -52,9 +52,6 @@
      * |:--------------|:--------------|:-------------|
      * |F16            |F16            |F16           |
      * |F32            |F32            |F32           |
-     * |S32            |S32            |S32           |
-     * |S16            |S16            |S16           |
-     * |U8             |U8             |U8            |
      *
      * Valid data layouts:
      * - Any
@@ -88,4 +85,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB */
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h
index 0d739e8..da41b94 100644
--- a/compute_kernel_writer/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/include/ckw/KernelWriter.h
@@ -25,11 +25,22 @@
 #ifndef CKW_INCLUDE_CKW_KERNELWRITER_H
 #define CKW_INCLUDE_CKW_KERNELWRITER_H
 
+#include "ckw/Kernel.h"
+#include "ckw/TensorInfo.h"
 #include "ckw/TensorOperand.h"
+#include "ckw/TensorSampler.h"
+#include "ckw/TileInfo.h"
 #include "ckw/TileOperand.h"
 #include "ckw/types/ConstantData.h"
 #include "ckw/types/ConvertPolicy.h"
+#include "ckw/types/DataType.h"
 #include "ckw/types/Operators.h"
+#include "ckw/types/TargetArchitecture.h"
+#include "ckw/types/TargetLanguage.h"
+#include "ckw/types/TensorComponentType.h"
+#include "ckw/types/TensorDataLayout.h"
+#include "ckw/types/TensorSamplerTypes.h"
+#include "ckw/types/TensorStorageType.h"
 
 #include <functional>
 #include <memory>
@@ -39,16 +50,8 @@
 namespace ckw
 {
 
-/** Forward Declerations */
-class Kernel;
-class TensorInfo;
-class TensorSampler;
+/** Forward Declarations */
 class TileArea;
-class TileInfo;
-
-enum class DataType;
-enum class TargetArchitecture;
-enum class TargetLanguage;
 
 /** A kernel writer.
  *
@@ -350,7 +353,6 @@
                                   const TileOperand   &z,
                                   const TileOperand   &batch_op) = 0;
 
-protected:
     // =============================================================================================
     // ID space management
     // =============================================================================================
@@ -367,6 +369,7 @@
     /** Get the current ID space. */
     int32_t id_space() const;
 
+protected:
     /** Set the current ID space.
      *
      * @param[in] value The ID space to be used.
diff --git a/compute_kernel_writer/include/ckw/TensorOperand.h b/compute_kernel_writer/include/ckw/TensorOperand.h
index 2672cd5..a3e53d1 100644
--- a/compute_kernel_writer/include/ckw/TensorOperand.h
+++ b/compute_kernel_writer/include/ckw/TensorOperand.h
@@ -43,6 +43,15 @@
     // Only kernel writer class interacts with tensor operand hence we allow it to access this field.
     friend class KernelWriter;
 
+    /** Create an empty tensor operand.
+     *
+     * The new tensor operand doesn't refer to any tensor and is therefore not usable.
+     */
+    TensorOperand();
+
+    /** Check if the tensor operand contains a tensor and is therefore usable. */
+    bool is_valid() const;
+
     /** Get the tensor info. */
     const TensorInfo &info() const;
 
@@ -92,7 +101,7 @@
     /** Initialize a new instance of @ref TensorOperand class for a tensor. */
     TensorOperand(ITensor &tensor);
 
-    ITensor &_tensor;
+    ITensor *_tensor;
 };
 
 } // namespace ckw
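
A minimal usage sketch for the new empty-operand state; the function
name and body are illustrative, not part of this patch:

```cpp
#include "ckw/Error.h"
#include "ckw/TensorOperand.h"

void empty_operand_example()
{
    // A default-constructed operand is a null handle until it is
    // assigned from a real operand (e.g. a declared kernel argument).
    ckw::TensorOperand src;
    CKW_ASSERT(!src.is_valid());
}
```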
diff --git a/compute_kernel_writer/include/ckw/TileOperand.h b/compute_kernel_writer/include/ckw/TileOperand.h
index 56dc5e7..556d589 100644
--- a/compute_kernel_writer/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/include/ckw/TileOperand.h
@@ -33,6 +33,7 @@
 class KernelWriter;
 class TensorOperand;
 class ITile;
+class TileInfo;
 
 /** A tile operand refers to a tile object that can be used for kernel writing. */
 class TileOperand
@@ -43,6 +44,18 @@
     friend class KernelWriter;
     friend class TensorOperand;
 
+    /** Create an empty tile operand.
+     *
+     * The new tile operand doesn't refer to any tile and is therefore not usable.
+     */
+    TileOperand();
+
+    /** Check if the tile operand contains a tile and is therefore usable. */
+    bool is_valid() const;
+
+    /** Get the tile info. */
+    const TileInfo &tile_info() const;
+
     /** Get a row vector of the current tile operand.
      *
      * @param[in] row The index of the row to be accessed in the current tile operand.
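
A short sketch of the new TileOperand introspection; the writer, the
tile name, and the shape are illustrative assumptions:

```cpp
#include "ckw/Error.h"
#include "ckw/KernelWriter.h"

using namespace ckw;

void inspect_tile(KernelWriter &writer)
{
    TileOperand acc  = writer.declare_tile("acc", TileInfo(DataType::Fp32, 4, 4));
    TileOperand left = acc.tile(0, 4, 0, 2); // rows 0-3, columns 0-1

    // tile_info() exposes the underlying tile: Fp32, height 4, width 4.
    CKW_ASSERT(acc.is_valid() && left.is_valid());
    CKW_ASSERT(acc.tile_info().width() == 4);
}
```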
diff --git a/compute_kernel_writer/include/ckw/types/ConstantData.h b/compute_kernel_writer/include/ckw/types/ConstantData.h
index 7708818..ea95049 100644
--- a/compute_kernel_writer/include/ckw/types/ConstantData.h
+++ b/compute_kernel_writer/include/ckw/types/ConstantData.h
@@ -53,6 +53,10 @@
     template <typename T>
     ConstantData(std::initializer_list<std::initializer_list<T>> values, DataType data_type);
 
+    /** Templated constructor */
+    template <typename T>
+    ConstantData(const std::vector<std::vector<T>> &values, DataType data_type);
+
 private:
     /** Validate the given data type and the template type
      *
diff --git a/compute_kernel_writer/include/ckw/types/Operators.h b/compute_kernel_writer/include/ckw/types/Operators.h
index 1e5f9bd..77b0519 100644
--- a/compute_kernel_writer/include/ckw/types/Operators.h
+++ b/compute_kernel_writer/include/ckw/types/Operators.h
@@ -43,6 +43,7 @@
     Fabs  = 0x0014,
     Log   = 0x0015,
     Round = 0x0016,
+    Floor = 0x0017,
 };
 
 /** Assignment operators. */
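
With UnaryOp::Floor, a writer can now emit an element-wise floor. A
minimal sketch, assuming dst and src are fp32 tiles of matching shape:

```cpp
#include "ckw/KernelWriter.h"

using namespace ckw;

void apply_floor(KernelWriter &writer, const TileOperand &dst, const TileOperand &src)
{
    writer.op_unary(dst, UnaryOp::Floor, src); // emits OpenCL floor() per tile row
}
```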
diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp
index a478231..92a3674 100644
--- a/compute_kernel_writer/src/KernelWriter.cpp
+++ b/compute_kernel_writer/src/KernelWriter.cpp
@@ -107,7 +107,8 @@
 
 ITensor &KernelWriter::get_tensor(const TensorOperand &operand)
 {
-    return operand._tensor;
+    CKW_ASSERT(operand._tensor != nullptr);
+    return *operand._tensor;
 }
 
 const std::vector<std::vector<std::string>> &KernelWriter::get_values(const ConstantData &data)
diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp
index bf11d0d..9499753 100644
--- a/compute_kernel_writer/src/TensorOperand.cpp
+++ b/compute_kernel_writer/src/TensorOperand.cpp
@@ -21,91 +21,115 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "ckw/TensorOperand.h"
 
+#include "ckw/Error.h"
+
 #include "src/ITensor.h"
 
 namespace ckw
 {
 
-TensorOperand::TensorOperand(ITensor &tensor) : _tensor(tensor)
+TensorOperand::TensorOperand() : _tensor(nullptr)
 {
 }
 
+TensorOperand::TensorOperand(ITensor &tensor) : _tensor(&tensor)
+{
+}
+
+bool TensorOperand::is_valid() const
+{
+    return _tensor != nullptr;
+}
+
 const TensorInfo &TensorOperand::info() const
 {
-    return _tensor.info();
+    CKW_ASSERT(is_valid() == true);
+    return _tensor->info();
 }
 
 TileOperand TensorOperand::stride0()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride0));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride0));
 }
 
 TileOperand TensorOperand::stride1()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride1));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride1));
 }
 
 TileOperand TensorOperand::stride2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride2));
 }
 
 TileOperand TensorOperand::stride3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride3));
 }
 
 TileOperand TensorOperand::stride4()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride4));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride4));
 }
 
 TileOperand TensorOperand::dim0()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim0));
+    return TileOperand(_tensor->component(TensorComponentType::Dim0));
 }
 
 TileOperand TensorOperand::dim1()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1));
 }
 
 TileOperand TensorOperand::dim2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim2));
 }
 
 TileOperand TensorOperand::dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim3));
 }
 
 TileOperand TensorOperand::dim4()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim4));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim4));
 }
 
 TileOperand TensorOperand::dim1_dim2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2));
 }
 
 TileOperand TensorOperand::dim1_dim2_dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2xDim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2xDim3));
 }
 
 TileOperand TensorOperand::dim2_dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim2xDim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim2xDim3));
 }
 
 TileOperand TensorOperand::offset_first_element_in_bytes()
 {
-    return TileOperand(_tensor.component(TensorComponentType::OffsetFirstElement));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::OffsetFirstElement));
 }
 
 } // namespace ckw
diff --git a/compute_kernel_writer/src/TensorSampler.cpp b/compute_kernel_writer/src/TensorSampler.cpp
index 91d5af2..e81c5f9 100644
--- a/compute_kernel_writer/src/TensorSampler.cpp
+++ b/compute_kernel_writer/src/TensorSampler.cpp
@@ -27,6 +27,8 @@
 namespace ckw
 {
 
+TensorSampler::TensorSampler() = default;
+
 TensorSampler::TensorSampler(TensorStorageType         storage,
                              TensorSamplerFormat       format,
                              TensorSamplerAddressModeX address_mode_x,
diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp
index 865ef85..8ced6cf 100644
--- a/compute_kernel_writer/src/TileOperand.cpp
+++ b/compute_kernel_writer/src/TileOperand.cpp
@@ -31,6 +31,10 @@
 namespace ckw
 {
 
+TileOperand::TileOperand() : _tile(nullptr), _row_start(0), _row_end(0), _col_start(0), _col_end(0)
+{
+}
+
 TileOperand::TileOperand(ITile &tile)
     : _tile(&tile), _row_start(0), _row_end(tile.info().height()), _col_start(0), _col_end(tile.info().width())
 {
@@ -46,6 +50,16 @@
     CKW_ASSERT(col_end > col_start && col_end <= _tile->info().width());
 }
 
+bool TileOperand::is_valid() const
+{
+    return _tile != nullptr;
+}
+
+const TileInfo &TileOperand::tile_info() const
+{
+    return _tile->info();
+}
+
 TileOperand TileOperand::tile(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) const
 {
     CKW_ASSERT(row_start >= 0 && _row_start + row_start < _row_end);
diff --git a/compute_kernel_writer/src/TileView.h b/compute_kernel_writer/src/TileView.h
index 50ae66b..42854ac 100644
--- a/compute_kernel_writer/src/TileView.h
+++ b/compute_kernel_writer/src/TileView.h
@@ -78,6 +78,10 @@
 class TileView
 {
 public:
+    /** Default constructor */
+    TileView() : _tile(nullptr), _area(0, 0, 0, 0)
+    {
+    }
     /** Create a tile view that refers to the whole tile.
      *
      * @param[in] tile The tile object.
@@ -179,6 +183,22 @@
                col_end() == _tile->info().width();
     }
 
+    /** Set the rectangular active area.
+     *
+     * @param[in] area The rectangular active area.
+     */
+    TileView &area(const TileArea &area)
+    {
+        _area = area;
+        return *this;
+    }
+
+    /** Get the tile area */
+    TileArea area() const
+    {
+        return _area;
+    }
+
 private:
     const T *_tile;
     TileArea _area;
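
The new area() getter/setter pair enables the leftover-write pattern
used by CLMemoryOpBufferHelper below: temporarily narrow the active
columns of a view, emit the partial access, then restore it. A
condensed sketch with illustrative names:

```cpp
#include "src/TileView.h"
#include "src/cl/CLTile.h"

#include <cstdint>

namespace ckw
{
void write_leftover(TileView<CLTile> &view, int32_t col_start, int32_t partial_width)
{
    const TileArea saved = view.area();
    view.area(TileArea(saved.row_start(), saved.row_end(), col_start,
                       col_start + partial_width));
    // ... emit the partial-width load/store here ...
    view.area(saved); // restore the original active area
}
} // namespace ckw
```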
diff --git a/compute_kernel_writer/src/cl/CLHelpers.cpp b/compute_kernel_writer/src/cl/CLHelpers.cpp
index 8e4a932..252c5cd 100644
--- a/compute_kernel_writer/src/cl/CLHelpers.cpp
+++ b/compute_kernel_writer/src/cl/CLHelpers.cpp
@@ -193,6 +193,9 @@
         case UnaryOp::Round:
             return {true, "round"};
 
+        case UnaryOp::Floor:
+            return {true, "floor"};
+
         default:
             CKW_THROW_MSG("Unsupported unary operation!");
     }
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
index 62e6853..8b4876b 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
@@ -47,6 +47,25 @@
 #include <tuple>
 #include <vector>
 
+namespace
+{
+std::string generate_cl_extensions()
+{
+    std::string ext = R"(
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(cl_khr_fp16)
+
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf)
+
+#define inf (INFINITY)
+)";
+    return ext;
+}
+} // namespace
+
 namespace ckw
 {
 
@@ -56,7 +75,7 @@
 std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
 {
     std::string code;
-
+    code += generate_cl_extensions();
     code += "__kernel void ";
     code += name;
     code += "\n(\n";
@@ -154,21 +173,31 @@
     const auto dst_type_str     = cl_get_variable_datatype_as_string(dst_type, dst_w);
 
     const std::string sat = policy == ConvertPolicy::Saturate ? "_sat" : "";
+
     CKW_ASSERT_IF(policy == ConvertPolicy::Saturate, !is_data_type_float(dst_type));
 
     const auto        broadcast_x = dst_w != 1 && src_w == 1;
     const std::string prefix      = broadcast_x ? "(" + dst_type_str + ")" : "";
 
-    CKW_ASSERT_MSG(src_view.data_type() != dst_view.data_type(), "Source and destination type must be different.");
     CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
                    "Tile height must match or source is broadcasting in y dimension.");
     CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
 
     // Broadcasting on y dimension is automatic (see CLTile::vector).
-    for (int32_t y = 0; y < dst_h; ++y)
+    if (src_view.data_type() == dst_view.data_type())
     {
-        append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
-                    src_view.vector(y).str, ");\n");
+        for (int32_t y = 0; y < dst_h; ++y)
+        {
+            append_code(dst_view.vector(y).str, " = ", src_view.vector(y).str, ";\n");
+        }
+    }
+    else
+    {
+        for (int32_t y = 0; y < dst_h; ++y)
+        {
+            append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
+                        src_view.vector(y).str, ");\n");
+        }
     }
 }
 
@@ -219,18 +248,12 @@
 
     CKW_ASSERT_MSG(lhs_view.data_type() == rhs_view.data_type(), "LHS and RHS type must match.");
 
-    CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
-                   "LHS tile height must match or source is broadcasting in y dimension.");
-    CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
-                   "RHS tile height must match or source is broadcasting in y dimension.");
-
-    CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
-                   "LHS tile width must match destination or LHS is broadcasting in x dimension.");
-    CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
-                   "RHS tile width must match destination or RHS is broadcasting in x dimension.");
-
     if (op == BinaryOp::MatMul_Nt_T)
     {
+        CKW_ASSERT_MSG(lhs_view.height() == dst_h, "LHS tile height must match the DST tile height");
+        CKW_ASSERT_MSG(rhs_view.height() == dst_w, "RHS tile height must match the DST tile width");
+        CKW_ASSERT_MSG(lhs_view.width() == rhs_view.width(), "LHS tile width must match the RHS tile width");
+
         CKW_ASSERT(is_data_type_float(data_type));
 
         for (int32_t y = 0; y < dst_h; ++y)
@@ -239,14 +262,24 @@
             {
                 for (int32_t k = 0; k < lhs_w; ++k)
                 {
-                    append_code(dst_view.scalar(x, y).str, " = fma(", lhs_view.scalar(k, y).str, ", ",
-                                rhs_view.scalar(k, x).str, ", ", dst_view.scalar(x, y).str, ");\n");
+                    append_code(dst_view.scalar(y, x).str, " = fma(", lhs_view.scalar(y, k).str, ", ",
+                                rhs_view.scalar(x, k).str, ", ", dst_view.scalar(y, x).str, ");\n");
                 }
             }
         }
     }
     else
     {
+        CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
+                       "LHS tile height must match or source is broadcasting in y dimension.");
+        CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
+                       "RHS tile height must match or source is broadcasting in y dimension.");
+
+        CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
+                       "LHS tile width must match destination or LHS is broadcasting in x dimension.");
+        CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
+                       "RHS tile width must match destination or RHS is broadcasting in x dimension.");
+
         const auto  op_info    = cl_get_binary_op(op, data_type);
         const auto  op_is_func = std::get<0>(op_info);
         const auto &op_name    = std::get<1>(op_info);
@@ -746,36 +779,35 @@
 
     ITensor &tensor = get_tensor(tensor_op);
 
-    std::unique_ptr<ICLMemoryOpHelper> helper;
-    switch (sampler.storage())
-    {
-        case TensorStorageType::BufferUint8Ptr:
-            helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op);
-            break;
-        case TensorStorageType::Texture2dReadOnly:
-        case TensorStorageType::Texture2dWriteOnly:
-            helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op);
-            break;
-        default:
-            CKW_THROW_MSG("Unsupported tensor storage");
-    }
-
-    // Load/store op doesn't support sub-tile access.
-    const auto tile       = to_cl_tile_view(tile_op).full_tile();
+    const auto tile       = to_cl_tile_view(tile_op);
     const auto x_tile     = to_cl_tile_view(x).full_tile();
     const auto y_tile     = to_cl_tile_view(y).full_tile();
     const auto z_tile     = to_cl_tile_view(z).full_tile();
     const auto batch_tile = to_cl_tile_view(batch).full_tile();
 
+    std::unique_ptr<ICLMemoryOpHelper> helper;
+    switch (sampler.storage())
+    {
+        case TensorStorageType::BufferUint8Ptr:
+            helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op, tile);
+            break;
+        case TensorStorageType::Texture2dReadOnly:
+        case TensorStorageType::Texture2dWriteOnly:
+            helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op, tile);
+            break;
+        default:
+            CKW_THROW_MSG("Unsupported tensor storage");
+    }
+
     CKW_ASSERT(x_tile.is_scalar());
     CKW_ASSERT(z_tile.is_scalar());
     CKW_ASSERT_IF(indirect_buffer, y_tile.info().width() == 1);
     CKW_ASSERT_IF(!indirect_buffer, y_tile.is_scalar());
     CKW_ASSERT(batch_tile.is_scalar());
 
-    helper->initialize(&tile, &x_tile, &z_tile, &batch_tile);
+    helper->initialize(&x_tile, &z_tile, &batch_tile);
 
-    for (int row = 0; row < tile.info().height(); ++row)
+    for (int row = 0; row < tile.height(); ++row)
     {
         if (!indirect_buffer)
         {
diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.h b/compute_kernel_writer/src/cl/CLTensorArgument.h
index 35df514..a79cf34 100644
--- a/compute_kernel_writer/src/cl/CLTensorArgument.h
+++ b/compute_kernel_writer/src/cl/CLTensorArgument.h
@@ -27,6 +27,7 @@
 #include "ckw/types/TensorComponentType.h"
 #include "ckw/types/TensorStorageType.h"
 
+#include "src/cl/CLTensorComponent.h"
 #include "src/ITensor.h"
 
 #include <memory>
@@ -39,8 +40,6 @@
 class TensorInfo;
 
 class ITensorComponent;
-class CLTensorComponent;
-class CLTensorStorage;
 
 /** OpenCL specific tensor argument
  *  Internally, the object keeps track of the components and storages used to minimize the number
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index a98ebed..7d16f35 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -34,15 +34,16 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
-                                      const ITensor        *tensor,
-                                      const TensorSampler  *sampler,
-                                      const Tensor3dMapper *mapper,
-                                      MemoryOperation       op,
-                                      const CLTile         *dst)
+bool CLMemoryOpBufferHelper::validate(const CLKernelWriter   *writer,
+                                      const ITensor          *tensor,
+                                      const TensorSampler    *sampler,
+                                      const Tensor3dMapper   *mapper,
+                                      MemoryOperation         op,
+                                      const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper, op, dst);
 
@@ -100,17 +101,14 @@
  *  The outermost block is x, then z and then y. This is why, if/else's covering for y are initialized
  *  at each row write. In some addressing modes, such as None, no if/else conditions are written.
  */
-void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpBufferHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    _dst = dst;
-
     CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
 
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
-    _coord_orig_z  = _coord_z;
+    _coord_x      = x->scalar(0, 0).str;
+    _coord_z      = z->scalar(0, 0).str;
+    _coord_b      = b->scalar(0, 0).str;
+    _coord_orig_z = _coord_z;
 
     out_of_bound_initialize_x(_coord_x);
     out_of_bound_initialize_z(_coord_z);
@@ -121,7 +119,7 @@
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_statement(_op, _ls_width_full, dst, address);
 
@@ -133,10 +131,17 @@
     // The left over load/store will be written in the finalize stage
     if (_ls_width_part.size() != 0)
     {
-        int32_t col_start = 0;
+        int32_t        col_start     = 0;
+        const TileArea original_area = _dst.area();
+
         for (int32_t partial_width : _ls_width_part)
         {
-            const std::string dst       = _dst->vector(row_id, col_start, partial_width).str;
+            // Set the active area
+            const TileArea area(original_area.row_start(), original_area.row_end(), col_start,
+                                col_start + partial_width);
+            _dst.area(area);
+
+            const std::string dst       = _dst.vector(row_id).str;
             const std::string coord_x   = _coord_x + " + " + std::to_string(col_start);
             const std::string address   = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
             const std::string statement = to_statement(_op, partial_width, dst, address);
@@ -144,6 +149,8 @@
 
             col_start += partial_width;
         }
+        // Restore the original area
+        _dst.area(original_area);
     }
 }
 
@@ -304,7 +311,7 @@
     CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
 
     const std::string ptr_buf  = _tensor->storage(tensor_storage).val;
-    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
+    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst.data_type(), 1);
 
     std::string address;
     address += "(__global ";
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
index 4e1a842..a6b3272 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
-#define CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
+#ifndef CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
+#define CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
 
 #include "src/cl/helpers/ICLMemoryOpHelper.h"
 
@@ -37,6 +37,8 @@
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL
@@ -45,19 +47,23 @@
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpBufferHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpBufferHelper(CLKernelWriter         *writer,
+                           ITensor                *tensor,
+                           TensorSampler          *sampler,
+                           MemoryOperation         op,
+                           const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
@@ -78,12 +84,12 @@
     std::vector<LeftoverDescriptor> _leftovers_x{};
     std::string                     _coord_orig_z{};
 
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_x(const std::string &coord);
     void out_of_bound_finalize_x();
@@ -99,4 +105,4 @@
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index b7d146b..f392cd8 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -33,18 +33,15 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-void CLMemoryOpImage2dHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpImage2dHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, dst));
-
-    _dst           = dst;
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
+    _coord_x = x->scalar(0, 0).str;
+    _coord_z = z->scalar(0, 0).str;
+    _coord_b = b->scalar(0, 0).str;
 }
 
 void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord_y)
@@ -52,7 +49,7 @@
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string sampler = to_ls_image2d_sampler();
     const std::string coord   = to_ls_image2d_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_ls_image2d(_op, _ls_width_full, dst, sampler, coord);
@@ -66,16 +63,16 @@
 {
 }
 
-bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
-                                       const ITensor        *tensor,
-                                       const TensorSampler  *sampler,
-                                       const Tensor3dMapper *mapper,
-                                       MemoryOperation       op,
-                                       const CLTile         *dst)
+bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter   *writer,
+                                       const ITensor          *tensor,
+                                       const TensorSampler    *sampler,
+                                       const Tensor3dMapper   *mapper,
+                                       MemoryOperation         op,
+                                       const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper);
 
-    if (dst->info().width() != 4)
+    if (dst.width() != 4)
     {
         return false;
     }
@@ -95,7 +92,7 @@
     {
         return false;
     }
-    if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
+    if ((dst.data_type() != DataType::Fp32) && (dst.data_type() != DataType::Fp16))
     {
         return false;
     }
@@ -143,10 +140,12 @@
                                                    const std::string &address) const
 {
     CKW_UNUSED(vector_width);
+    CKW_ASSERT_MSG(_dst.data_type() == DataType::Fp32 || _dst.data_type() == DataType::Fp16,
+                   "Image2d only supports floating-point data type");
 
     const TensorStorageType tensor_storage = _sampler->storage();
     const std::string       image2d_obj    = _tensor->storage(tensor_storage).val;
-    const std::string       post_fix       = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
+    const std::string       post_fix       = _dst.data_type() == DataType::Fp32 ? "f" : "h";
 
     switch (op)
     {
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
index fd9b097..6c42c13 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
@@ -35,6 +35,8 @@
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL for Image2d type */
@@ -42,29 +44,33 @@
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpImage2dHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpImage2dHelper(CLKernelWriter         *writer,
+                            ITensor                *tensor,
+                            TensorSampler          *sampler,
+                            MemoryOperation         op,
+                            const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
 private:
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_y(const std::string &coord);
     void out_of_bound_finalize_y();
diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
index f46fee9..a5b679a 100644
--- a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
@@ -28,6 +28,7 @@
 #include "ckw/TensorSampler.h"
 
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 #include <cstdint>
 #include <memory>
@@ -55,18 +56,24 @@
      * @param[in] tensor  @ref ckw::ITensor object to perform the memory operation on
      * @param[in] sampler @ref ckw::TensorSampler object that tells how to sample a tensor
      * @param[in] op      The memory operation to be done (e.g. Load/Store)
+     * @param[in] dst     The tile to perform the memory operation on
      */
-    ICLMemoryOpHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op)
+    ICLMemoryOpHelper(CLKernelWriter         *writer,
+                      ITensor                *tensor,
+                      TensorSampler          *sampler,
+                      MemoryOperation         op,
+                      const TileView<CLTile> &dst)
+        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op), _dst(dst)
     {
-        _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _mapper        = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _ls_width_full = _dst.width();
     }
 
     /** Copy constructor */
-    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = delete;
 
     /** Assignment operator overload */
-    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = delete;
 
     /** Destructor */
     virtual ~ICLMemoryOpHelper() = default;
@@ -75,12 +82,11 @@
      *  the batch offset as a tile object, and initializes the code inside
      *  the writer object.
      *
-     * @param[in] dst  tile object to perform the memory operation on
      * @param[in] x    tile object that describes the x-coordinate of the tensor involved
      * @param[in] z    tile object that describes the z-coordinate of the tensor involved
      * @param[in] b    tile object that describes the batch offset of the tensor involved
      */
-    virtual void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) = 0;
+    virtual void initialize(const CLTile *x, const CLTile *z, const CLTile *b) = 0;
 
     /** Method that writes the actual code to the writer that performs the mentioned memory
      *  operation on the tile initialized. It writes the code for a specific row given in the
@@ -104,7 +110,7 @@
     TensorSampler                  *_sampler{nullptr};
     MemoryOperation                 _op;
     std::unique_ptr<Tensor3dMapper> _mapper{nullptr};
-    const CLTile                   *_dst{nullptr};
+    TileView<CLTile>                _dst{};
     int32_t                         _ls_width_full{0};
     std::string                     _coord_x{};
     std::string                     _coord_z{};
@@ -112,4 +118,4 @@
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H
diff --git a/compute_kernel_writer/src/types/ConstantData.cpp b/compute_kernel_writer/src/types/ConstantData.cpp
index 67b1103..6d15eab 100644
--- a/compute_kernel_writer/src/types/ConstantData.cpp
+++ b/compute_kernel_writer/src/types/ConstantData.cpp
@@ -31,7 +31,7 @@
 namespace
 {
 template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
 {
     std::stringstream ss;
     ss << std::scientific << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
@@ -39,14 +39,14 @@
 }
 
 template <typename T>
-inline typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
+typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
 to_str(T value)
 {
     return std::to_string(value);
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
 {
     return std::to_string((int)value);
 }
@@ -72,6 +72,24 @@
 }
 
 template <typename T>
+ConstantData::ConstantData(const std::vector<std::vector<T>> &values, DataType data_type) : _data_type(data_type)
+{
+    CKW_ASSERT(validate<T>(data_type));
+    CKW_ASSERT(values.size() > 0);
+
+    for (const auto &value_arr : values)
+    {
+        // Each row must have the same number of elements
+        CKW_ASSERT(value_arr.size() == (*values.begin()).size());
+
+        StringVector vec;
+        std::transform(value_arr.begin(), value_arr.end(), std::back_inserter(vec), [](T val) { return to_str(val); });
+
+        _values.push_back(std::move(vec));
+    }
+}
+
+template <typename T>
 bool ConstantData::validate(DataType data_type)
 {
     switch (data_type)
@@ -100,6 +118,10 @@
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<uint32_t>>, DataType);
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<bool>>, DataType);
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<float>>, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<int32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<uint32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<bool>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<float>> &, DataType);
 
 template bool ConstantData::validate<int32_t>(DataType);
 template bool ConstantData::validate<uint32_t>(DataType);
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
index bfa6724..44a4df1 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
@@ -61,25 +61,19 @@
 
         _tests.push_back({ 2, 4, DataType::Bool, 2, 1, 2, 1, DataType::Fp32, BinaryOp::GreaterEqual, "G0__dst__0 = (float4)G0__lhs__0 >= (float4)G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 >= (float4)G0__rhs__1;\n" }); // LHS and RHS x-dimension broadcast.
 
-        _tests.push_back({ 2, 3, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
+        _tests.push_back({ 2, 2, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
                            "G0__dst__0.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__1.s1);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__0.s0);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s2, G0__rhs__0.s2, G0__dst__0.s0);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s0, G0__rhs__1.s0, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__1.s1, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s2, G0__rhs__1.s2, G0__dst__0.s1);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__0.s0, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s1, G0__rhs__0.s1, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s2, G0__rhs__0.s2, G0__dst__1.s0);\n"
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__1.s1);\n"
                            "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" });
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s2, G0__rhs__1.s2, G0__dst__1.s1);\n" });
     }
 
     bool run() override
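
The corrected expected strings follow the reference semantics of `BinaryOp::MatMul_Nt_T` (non-transposed LHS, transposed RHS). A self-contained sketch of those semantics, matching the 2x2 = (2x3) x (2x3)^T case above:

    // dst[m][n] += lhs[m][k] * rhs[n][k], mirroring the fma sequence in the expected code.
    template <int M, int N, int K>
    void matmul_nt_t(const float (&lhs)[M][K], const float (&rhs)[N][K], float (&dst)[M][N])
    {
        for (int m = 0; m < M; ++m)
            for (int n = 0; n < N; ++n)
                for (int k = 0; k < K; ++k)
                    dst[m][n] += lhs[m][k] * rhs[n][k];
    }
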
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
index 3e10569..855c747 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
@@ -81,7 +81,15 @@
             "{\n"
             "}\n";
 
-        const auto &actual_code = kernel->source_code();
+        std::string actual_code = kernel->source_code();
+
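+        // Compare from "__kernel" onwards, ignoring anything the writer emits before the kernel signature.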
+        std::size_t pos = actual_code.find("__kernel");
+
+        if (pos != std::string::npos)
+        {
+            // Remove text before "__kernel"
+            actual_code = actual_code.substr(pos);
+        }
 
         int test_id = 0;
         VALIDATE_TEST(kernel->arguments().size() == 4, all_tests_passed, test_id++);
diff --git a/filelist.json b/filelist.json
index 7c530f3..2f33b5c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2388,7 +2388,9 @@
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
-        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
         "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp"
diff --git a/scripts/format_code.py b/scripts/format_code.py
index 29dbea7..b456bd4 100755
--- a/scripts/format_code.py
+++ b/scripts/format_code.py
@@ -216,7 +216,7 @@
         if strategy == "git-head":
             cmd = "git diff-tree --no-commit-id --name-status -r HEAD | grep \"^[AMRT]\" | cut -f 2"
         elif strategy == "git-diff":
-            cmd = "git diff --name-status --cached -r HEAD | grep \"^[AMRT]\" | cut -f 2"
+            cmd = "git diff --name-status --cached -r HEAD | grep \"^[AMRT]\" | rev | cut -f 1 | rev"
         else:
             cmd = "git ls-tree -r HEAD --name-only"
             # Skip copyright checks when running on all files because we don't know when they were last modified
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
index c4ab110..a42b397 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 
-#include "ckw/Error.h"
+#include "compute_kernel_writer/include/ckw/Error.h"
 
 namespace arm_compute
 {
@@ -33,75 +33,71 @@
 namespace dynamic_fusion
 {
 
-GpuCkwComponentArgument::GpuCkwComponentArgument()
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor)
 {
 }
 
-GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
-{
-}
-
-GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand             &tile,
-                                                                      const ckw::TensorTileSampler &tile_sampler)
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand         &tile,
+                                                                      const ckw::TensorSampler &sampler)
 {
     CKW_ASSERT(_tile == nullptr);
 
-    _tile         = &tile;
-    _tile_sampler = tile_sampler;
+    _tile    = tile;
+    _sampler = sampler;
 
     return *this;
 }
 
 bool GpuCkwComponentArgument::has_tensor() const
 {
-    return _tensor != nullptr;
+    return _tensor.is_valid();
 }
 
 ckw::TensorOperand &GpuCkwComponentArgument::tensor()
 {
-    CKW_ASSERT(_tensor != nullptr);
+    CKW_ASSERT(_tensor.is_valid());
 
-    return *_tensor;
+    return _tensor;
 }
 
 const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const
 {
-    CKW_ASSERT(_tensor != nullptr);
+    CKW_ASSERT(_tensor.is_valid());
 
-    return *_tensor;
+    return _tensor;
 }
 
 bool GpuCkwComponentArgument::has_tile() const
 {
-    return _tile != nullptr;
+    return _tile.is_valid();
 }
 
 ckw::TileOperand &GpuCkwComponentArgument::tile()
 {
-    CKW_ASSERT(_tile != nullptr);
+    CKW_ASSERT(_tile.is_valid());
 
-    return *_tile;
+    return _tile;
 }
 
 const ckw::TileOperand &GpuCkwComponentArgument::tile() const
 {
-    CKW_ASSERT(_tile != nullptr);
+    CKW_ASSERT(_tile.is_valid());
 
-    return *_tile;
+    return _tile;
 }
 
-ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler()
+ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler()
 {
-    CKW_ASSERT(_tile != nullptr);
+    CKW_ASSERT(_tile.is_valid());
 
-    return _tile_sampler;
+    return _sampler;
 }
 
-const ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler() const
+const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const
 {
-    CKW_ASSERT(_tile != nullptr);
+    CKW_ASSERT(_tile.is_valid());
 
-    return _tile_sampler;
+    return _sampler;
 }
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
index 863989a..7a57c81 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,9 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
 
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class TensorOperand;
-class TileOperand;
-} // namespace ckw
+#include "compute_kernel_writer/include/ckw/TensorOperand.h"
+#include "compute_kernel_writer/include/ckw/TensorSampler.h"
+#include "compute_kernel_writer/include/ckw/TileOperand.h"
 
 namespace arm_compute
 {
@@ -44,24 +40,27 @@
 class GpuCkwComponentArgument
 {
 public:
-    /** Initialize a new instance of @ref GpuCkwComponentArgument class for empty virtual tensor. */
-    GpuCkwComponentArgument();
+    /** Default constructor */
+    GpuCkwComponentArgument() = default;
 
     /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
      *
      * @param[in] tensor The user tensor.
      */
-    explicit GpuCkwComponentArgument(ckw::TensorOperand &tensor);
+    explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);
 
-    /** Set virtual tensor information (tile, sampler) for the argument.
+    /** Bind the tile and sampler to the tensor argument.
      *
-     * If the component is a user tensor, it can be treated as virtual tensor as well
-     * and won't be loaded again using @ref GpuCkwKernelWriter::op_load_once method.
+     * This method can be used to share a tile and its sampler, associated with a tensor,
+     * among different kernel components. For example, when the destination tile and
+     * destination sampler are created for the first time (root component), this method
+     * binds them to the destination tensor so that the subsequent simple components
+     * know the tile size and how to access the elements from memory.
      *
      * @param[in] tile    The tile that has been loaded.
      * @param[in] sampler The tensor sampling information that has been used to load the tile.
      */
-    GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler);
+    GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);
 
     /** Get whether the argument is a user tensor. */
     bool has_tensor() const;
@@ -101,18 +100,18 @@
      *
      * If the tile is not available, throw an error.
      */
-    ckw::TensorTileSampler &tile_sampler();
+    ckw::TensorSampler &tensor_sampler();
 
     /** Get the tensor sampling information for the tile.
      *
      * If the tile is not available, throw an error.
      */
-    const ckw::TensorTileSampler &tile_sampler() const;
+    const ckw::TensorSampler &tensor_sampler() const;
 
 private:
-    ckw::TensorOperand    *_tensor{nullptr};
-    ckw::TileOperand      *_tile{nullptr};
-    ckw::TensorTileSampler _tile_sampler{};
+    ckw::TensorOperand _tensor{};
+    ckw::TileOperand   _tile{};
+    ckw::TensorSampler _sampler{};
 };
 
 } // namespace dynamic_fusion
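
To make the bind-and-share flow concrete, here is a condensed sketch (names hypothetical; the calls are the same ones used by the kernel components in this patch) of how a root component creates and binds the destination tile:

    // Root component: declare the destination tile and its sampler once...
    ckw::TensorSampler sampler;
    sampler.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
    sampler.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
    sampler.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
    sampler.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
    sampler.storage(ckw::TensorStorageType::BufferUint8Ptr);

    auto tile = writer->declare_tile("dst", ckw::TileInfo(ckw::DataType::Fp32, m0, n0));

    // ...then bind them so components appended later reuse dst->tile() and
    // dst->tensor_sampler() instead of re-declaring or re-loading.
    dst->init_virtual_tensor(tile, sampler);
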
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index c927f32..a0e5e16 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,13 +28,17 @@
 
 #include "src/common/utils/Log.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h"
+#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h"
+
 using namespace ckw;
+
 namespace arm_compute
 {
 namespace experimental
@@ -42,21 +46,22 @@
 namespace dynamic_fusion
 {
 GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
-    : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{}
 {
+    _components = components;
+
     // Generate kernel name
-    std::string name = "";
+    std::string kernel_name;
     for (auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
         ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
-        name += ckw_driver->get_name(_components) + "__";
+        kernel_name += ckw_driver->get_name(_components) + "__";
     }
 
     // Generate kernel code
-    _kernel.name(name);
-    GpuCkwKernelWriter       root_writer(_kernel);
-    GpuCkwScopedKernelWriter writer(&root_writer);
+    auto root_writer =
+        KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL);
+    GpuCkwScopedKernelWriter writer(root_writer.get());
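+    // A single stable CKW writer is shared by all components; the scoped wrapper manages its ID space per component.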
     GpuCkwVariableTable      vtable{};
 
     for (auto &comp : _components)
@@ -65,22 +70,27 @@
         ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
         ckw_driver->write_component_code(_components, vtable, writer);
     }
-    _code = root_writer.generate_code();
+    auto kernel = root_writer->emit_kernel(kernel_name);
+
+    // Set the kernel name, kernel arguments and source code
+    _kernel_name = kernel_name;
+    _kernel_args = kernel->arguments();
+    _kernel_code = kernel->source_code();
 }
 
 std::string GpuCkwDriver::get_name()
 {
-    return _kernel.name();
+    return _kernel_name;
 }
 
 std::string GpuCkwDriver::get_code()
 {
-    return _code;
+    return _kernel_code;
 }
 
 std::string GpuCkwDriver::get_config_id()
 {
-    std::string id = "";
+    std::string id;
     for (auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
@@ -100,7 +110,7 @@
 GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
 {
     GpuKernelArgumentList args{};
-    for (const auto &arg : _kernel.arguments())
+    for (const auto &arg : _kernel_args)
     {
         switch (arg.type())
         {
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 2ca5fb4..b80ce0d 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
 
 #include "ckw/Kernel.h"
 
@@ -30,6 +30,8 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
 
+#include "compute_kernel_writer/include/ckw/Kernel.h"
+#include "compute_kernel_writer/include/ckw/KernelArgument.h"
 #include <map>
 #include <string>
 
@@ -49,7 +51,7 @@
 {
 public:
     /** Default constructor */
-    GpuCkwDriver() = default;
+    GpuCkwDriver() = delete;
     /** Constructor
      *
      * @param[in] components Kernel component group from which the kernel will be generated
@@ -69,13 +71,14 @@
     GpuKernelArgumentList get_kernel_arguments() override;
 
 private:
-    GpuKernelComponentGroup _components{};
-    ckw::Kernel             _kernel;
-    std::string             _code;
+    GpuKernelComponentGroup          _components{};
+    std::string                      _kernel_name{};
+    std::vector<ckw::KernelArgument> _kernel_args{};
+    std::string                      _kernel_code{};
 };
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
deleted file mode 100644
index 5f8ce91..0000000
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TileInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-
-GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
-{
-}
-
-void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
-{
-    if (!tensor_or_tile->has_tile())
-    {
-        CKW_ASSERT(tensor_or_tile->has_tensor());
-
-        auto &tensor = tensor_or_tile->tensor();
-
-        const auto tile_name = tensor.name() + "_tile";
-        auto      &tile =
-            declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
-
-        op_load(tile, tensor, sampler);
-
-        tensor_or_tile->init_virtual_tensor(tile, sampler);
-    }
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h
deleted file mode 100644
index b916e6b..0000000
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class Kernel;
-} // namespace ckw
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-
-class GpuCkwComponentArgument;
-
-/** Extended implementation of kernel writer for dynamic fusion. */
-class GpuCkwKernelWriter : public ckw::KernelWriter
-{
-public:
-    /** Initialize a new instance of @ref GpuCkwKernelWriter class.
-     *
-     * @param[in] kernel The kernel to be generated.
-     */
-    explicit GpuCkwKernelWriter(ckw::Kernel &kernel);
-
-    /** Load the user tensor to the tile in the same component argument if it hasn't been loaded.
-     *
-     * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor.
-     * @param[in] sampler        The tensor sampling information to load the tile.
-     */
-    void op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler);
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
index cbadbd9..ae12d13 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,6 @@
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-
 namespace arm_compute
 {
 namespace experimental
@@ -33,34 +31,34 @@
 namespace dynamic_fusion
 {
 
-GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer)
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
     : _writer(writer), _parent_id_space(writer->id_space())
 {
-    _writer->next_id_space();
+    _writer->new_id_space();
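+    // Entering a scope switches to a fresh ID space so tiles declared by different
+    // components get distinct name prefixes (e.g. G0__, G1__).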
 }
 
 GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
     : _writer(other._writer), _parent_id_space(other._writer->id_space())
 {
-    _writer->next_id_space();
+    _writer->new_id_space();
 }
 
-GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->()
+ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
 {
     return _writer;
 }
 
-const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->() const
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
 {
     return _writer;
 }
 
-GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer()
+ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
 {
     return _writer;
 }
 
-const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer() const
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
 {
     return _writer;
 }
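
Because the scoped writer is passed to components by value, each component's copy opens its own ID space. A minimal sketch (hypothetical component body with a condensed signature; `declare_tile` as used throughout this patch):

    // Copying the scoped writer bumps the ID space, so "t" below is emitted with a
    // per-component prefix (e.g. G1__t) and cannot clash with tiles of other components.
    void write_component_code(GpuCkwScopedKernelWriter writer) // by value: copy => new ID space
    {
        auto t = writer->declare_tile("t", ckw::TileInfo(ckw::DataType::Fp32));
    }
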
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
index 81049bf..84dd706 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
 
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
 #include <cstdint>
 
 namespace arm_compute
@@ -34,14 +35,12 @@
 namespace dynamic_fusion
 {
 
-class GpuCkwKernelWriter;
-
 /** Helper to automatically manage kernel writer ID space. */
 class GpuCkwScopedKernelWriter
 {
 public:
     /** Initialize a new instance of @ref GpuCkwScopedKernelWriter class. */
-    explicit GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer);
+    explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);
 
     /** Create a new scope from the specified scoped kernel writer. */
     GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);
@@ -50,20 +49,20 @@
     GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;
 
     /** Access the underlying kernel writer. */
-    GpuCkwKernelWriter *operator->();
+    ckw::KernelWriter *operator->();
 
     /** Access the underlying kernel writer. */
-    const GpuCkwKernelWriter *operator->() const;
+    const ckw::KernelWriter *operator->() const;
 
     /** Get the kernel writer. */
-    GpuCkwKernelWriter *writer();
+    ckw::KernelWriter *writer();
 
     /** Get the kernel writer. */
-    const GpuCkwKernelWriter *writer() const;
+    const ckw::KernelWriter *writer() const;
 
 private:
-    GpuCkwKernelWriter *_writer;
-    int32_t             _parent_id_space;
+    ckw::KernelWriter *_writer;
+    int32_t            _parent_id_space;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 88a0cf7..66ccc1a 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -40,7 +39,6 @@
 GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
                                                                GpuCkwScopedKernelWriter      &writer,
                                                                const ITensorInfo             *tensor,
-                                                               TensorStorageType              storage,
                                                                const std::string             &alias)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
@@ -65,7 +63,7 @@
         std::stringstream ss;
         ss << alias << "_t" << abs(tensor->id());
         const auto              uniq_name = ss.str();
-        GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))};
+        GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))};
         auto                  &&inserted = _vars.emplace(tensor->id(), var);
         return &(inserted.first->second);
     }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 2b11891..fc8764c 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
 
 #include "arm_compute/core/ITensorInfo.h"
 
@@ -38,7 +38,6 @@
 {
 class GpuKernelComponentGroup;
 class GpuCkwScopedKernelWriter;
-enum class TensorStorageType;
 
 /** A table of all the variables used in the kernel.
  *
@@ -54,7 +53,6 @@
      * @param[in] comp_group Component group the tensor belongs to
      * @param[in] writer     Compute Kernel Writer
      * @param[in] tensor     Tensor info with which the new variable is associated
-     * @param[in] storage    Tensor storage type associated with the tensor
      * @param[in] alias      Alias for the variable. Will be used as part of the variable name
      *
      * @return GpuCkwComponentArgument*
@@ -62,7 +60,6 @@
     GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
                                               GpuCkwScopedKernelWriter      &writer,
                                               const ITensorInfo             *tensor,
-                                              TensorStorageType              storage,
                                               const std::string             &alias = "unnamed");
 
 private:
@@ -72,4 +69,4 @@
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index c3b1b3c..68f478a 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,69 +26,29 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
 
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
 #include <string>
 
-using namespace ckw;
 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
-namespace
-{
-/** Create a simple sampler from tile of dimension [m0, n0]
- */
-inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_t m0, int32_t n0)
-{
-    TensorTileSampler sampler;
-
-    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
-    auto &const_0 = writer->declare_tile("0", 0);
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
-
-    auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
-    auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
-    auto &m0_t    = writer->declare_tile("m0", m0);
-    auto &n0_t    = writer->declare_tile("n0", n0);
-    writer->op_binary_expression(x_coord, gid_0, BinaryOp::Mul, n0_t);
-    writer->op_binary_expression(y_coord, gid_1, BinaryOp::Mul, m0_t);
-
-    sampler.x(x_coord);
-    sampler.y(y_coord);
-    sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
-    sampler.b(gid_2);
-
-    sampler.width(n0);
-    sampler.height(m0);
-
-    sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
-    sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
-    sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); // Dimensions higher than 3 not supported yet
-
-    return sampler;
-}
-} // namespace
 
 GpuCkwActivation::GpuCkwActivation(ComponentId                      id,
                                    const ArgumentPack<ITensorInfo> &tensors,
-                                   const Attributes                &attributes)
+                                   const Attributes                &attributes) // NOLINT
     : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
@@ -100,78 +60,223 @@
                                             GpuCkwVariableTable     &vtable,
                                             GpuCkwScopedKernelWriter writer) const
 {
-    const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_h  = static_cast<int32_t>(_dst->dimension(1));
+    const auto dst_dt = to_ckw(_dst->data_type());
 
-    load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
+    // CKW constants
+    auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_i32     = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_neg_1_fp  = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt));
+    auto const_pos_1_fp  = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt));
+    auto const_0_fp      = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_A_fp      = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt));
+    auto const_B_fp      = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt));
 
-    auto &src_tile = src->tile();
-    auto &dst_tile = dst->tile();
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not already declared)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The compute block parameters depend on the employed tensor format
 
-    // Constants
-    const auto &constant_minus_1 = writer->declare_tile("minus_1", -1);
-    const auto &constant_pos_1   = writer->declare_tile("one", 1);
-    const auto &constant_zero    = writer->declare_tile("zero", 0);
-    const auto &constant_A       = writer->declare_tile("A_VAL", _attributes.a());
-    const auto &constant_B       = writer->declare_tile("B_VAL", _attributes.b());
+    // Destination compute block size
+    int32_t dst_n0 = -1;
+    int32_t dst_m0 = -1;
 
-    // Perform the operation.
+    // Destination compute block size left-over
+    int32_t dst_n0_partial = -1;
+    int32_t dst_m0_partial = -1;
+
+    // Shift-back for the overlapping-min strategy
+    int32_t dst_shift_back = -1;
+
+    if (!dst->has_tile())
+    {
+        // If this is the ROOT component, use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+        // as the tensor format
+        const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+        dst_n0         = root_window.x().step();
+        dst_m0         = root_window.y().step();
+        dst_n0_partial = _dst->dimension(0) % dst_n0;
+        dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+        dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+        ckw::TensorSampler sampler_dst;
+        sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+
+        if (dst_n0_partial == 0)
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+        }
+
+        if (dst_m0_partial == 0)
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+        }
+
+        sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+        sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+        // Declare destination tile
+        auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+        // Bind tile to the tensor
+        dst->init_virtual_tensor(tile_dst, sampler_dst);
+    }
+    else
+    {
+        // dst_m0_partial depends on the TensorSamplerFormat
+        dst_n0         = dst->tile().tile_info().width();
+        dst_m0         = dst->tile().tile_info().height();
+        dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+        ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+        if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+        }
+        else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            dst_m0_partial = _dst->dimension(1) % dst_m0;
+        }
+
+        // Shift-back for the overlapping-min strategy
+        dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+    }
+
+    const auto &tile_dst = dst->tile();
+
+    /********************************************************************************
+     * 4 - Define the CKW constants for the compute block parameters
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_dst_shift_back_n0 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    if (!src->has_tile())
+    {
+        // Sampler
+        ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+        auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32));
+
+        writer->op_get_global_id(tile_gid_0, 0);
+        writer->op_get_global_id(tile_gid_1, 1);
+        writer->op_get_global_id(tile_gid_2, 2);
+
+        auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+        auto tile_mout0 =
+            writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+        auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+        auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
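+        // Overlapping-min: the x-coordinate of the last block is shifted back by dst_shift_back
+        // so a partial block overlaps the previous one instead of accessing out-of-bounds memory.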
+        get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0,
+                                                const_0_i32);
+        get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0);
+
+        // Get the boundary aware coordinates at each global dimension index
+        if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            writer->op_assign(tile_mout1, const_0_i32);
+            get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+        }
+        else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+            writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+        }
+
+        auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+        writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+
+        // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+        src->init_virtual_tensor(tile_src, sampler_src);
+    }
+
+    const auto &tile_src = src->tile();
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code (optional)
+     ********************************************************************************/
+
+    // Not required
+
+    /********************************************************************************
+     * 7 - Write the rest of the code
+     ********************************************************************************/
     switch (_attributes.activation())
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
         {
             // dst = src * -1
-            writer->op_binary_expression(dst_tile, src_tile, BinaryOp::Mul, constant_minus_1);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp);
             // dst = exp(src * -1)
-            writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Exp, dst_tile);
+            writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst);
             // dst = 1 + (exp(src * -1))
-            writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, constant_pos_1);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp);
             // dst = 1 / (1 + exp(src * -1))
-            writer->op_binary_expression(dst_tile, constant_pos_1, BinaryOp::Div, dst_tile);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst);
             break;
         }
         case ActivationLayerInfo::ActivationFunction::TANH:
         {
             // dst = B_VAL * src
-            writer->op_binary_expression(dst_tile, src_tile, BinaryOp::Mul, constant_B);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_B_fp);
             // dst = tanh(B_VAL * src)
-            writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Tanh, dst_tile);
+            writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_dst);
             // dst = A_VAL * tanh(B_VAL * src)
-            writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Mul, constant_A);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_dst, const_A_fp);
             break;
         }
         case ActivationLayerInfo::ActivationFunction::RELU:
         {
             // dst = max(src, 0)
-            writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_zero);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
             break;
         }
         case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
         {
             //dst = max(src, 0)
-            writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_zero);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
             //dst = min(max(src, 0), A_VAL)
-            writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Min, dst_tile, constant_A);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
             break;
         }
         case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
         {
             //dst = max(src, B_VAL)
-            writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_B);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp);
             //dst = min(max(src, B_VAL), A_VAL)
-            writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Min, dst_tile, constant_A);
+            writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
             break;
         }
         default:
             CKW_ASSERT(false);
             break;
     }
+    ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
 }
 
 Window GpuCkwActivation::get_window() const
@@ -182,8 +287,8 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int     num_elems_processed_per_iteration =
+    constexpr uint32_t vector_size_byte_opencl = 16;
+    const uint32_t     num_elems_processed_per_iteration =
         adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
     Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index e8e5087..d3e0dba 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,65 +26,25 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
 #include <string>
 
-using namespace ckw;
 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
-namespace
-{
-/** Create a simple sampler from tile of dimension [m0, n0]
- */
-inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_t m0, int32_t n0)
-{
-    TensorTileSampler sampler;
-
-    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
-    auto &const_0 = writer->declare_tile("0", 0);
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
-
-    auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
-    auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
-    auto &m0_t    = writer->declare_tile("m0", m0);
-    auto &n0_t    = writer->declare_tile("n0", n0);
-    writer->op_binary_expression(x_coord, gid_0, BinaryOp::Mul, n0_t);
-    writer->op_binary_expression(y_coord, gid_1, BinaryOp::Mul, m0_t);
-
-    sampler.x(x_coord);
-    sampler.y(y_coord);
-    sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
-    sampler.b(gid_2);
-
-    sampler.width(n0);
-    sampler.height(m0);
-
-    sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
-    sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
-    sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); // Dimensions higher than 3 not supported yet
-
-    return sampler;
-}
-} // namespace
 
 GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
     : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
@@ -92,72 +52,187 @@
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+    ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false,
+                             "The source data type must be a floating-point data type");
 }
 
 void GpuCkwCast::write_component_code(const ComponentGroup    &comp_group,
                                       GpuCkwVariableTable     &vtable,
                                       GpuCkwScopedKernelWriter writer) const
 {
-    const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
 
-    // Load the source tile and prepare the sampler.
-    if (!src->has_tile())
+    // CKW constants
+    auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_i32     = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not already declared)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The compute block parameters depend on the employed tensor format
+
+    // Destination compute block size
+    int32_t dst_n0 = -1;
+    int32_t dst_m0 = -1;
+
+    // Destination compute block size left-over
+    int32_t dst_n0_partial = -1;
+    int32_t dst_m0_partial = -1;
+
+    // Shift-back for the overlapping-min strategy
+    int32_t dst_shift_back = -1;
+
+    if (!dst->has_tile())
     {
-        const auto sampler = create_sampler(writer, m0, n0);
-        writer->op_load_once(src, sampler);
+        // If this is the ROOT component, use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+        // as the tensor format
+        const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+        dst_n0         = root_window.x().step();
+        dst_m0         = root_window.y().step();
+        dst_n0_partial = _dst->dimension(0) % dst_n0;
+        dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+        dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+        ckw::TensorSampler sampler_dst;
+        sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+        if (dst_n0_partial == 0)
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+        }
+
+        if (dst_m0_partial == 0)
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+        }
+
+        sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+        sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+        // Declare destination tile
+        ckw::DataType dst_dt   = to_ckw(_dst->data_type());
+        auto          tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+        // Bind tile to the tensor
+        dst->init_virtual_tensor(tile_dst, sampler_dst);
     }
     else
     {
-        const auto &sampler = src->tile_sampler();
-        writer->op_load_once(src, sampler);
+        // If NOT the root component, take dst_n0 and dst_m0 from the existing tile.
+        // Attention: dst_m0_partial depends on the TensorSamplerFormat
+        dst_n0         = dst->tile().tile_info().width();
+        dst_m0         = dst->tile().tile_info().height();
+        dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+        ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+        if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+        }
+        else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            dst_m0_partial = _dst->dimension(1) % dst_m0;
+        }
+
+        // Shift-back for the overlapping-min strategy
+        dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
     }
 
-    const auto &src_tile = src->tile();
-    const auto &sampler  = src->tile_sampler();
+    const auto &tile_dst = dst->tile();
 
-    // Prepare the output tile.
-    if (!dst->has_tile())
+    /********************************************************************************
+     * 4 - Define the CKW constants for the compute block parameters
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_dst_shift_back_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    if (!src->has_tile())
     {
-        // Get Target datatype and convert it to ckw::DataType.
-        ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
+        // Sampler
+        ckw::TensorSampler sampler_src = dst->tensor_sampler();
 
-        // Create dst_tile based on src_tile dimensions and with target DataType.
-        const TileInfo src_tile_info = src_tile.tile_info();
-        const TileInfo dst_tile_info = TileInfo(target_dt, src_tile_info.height(), src_tile_info.width());
+        auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
 
-        // Declare dst_tile
-        auto &tile = writer->declare_tile("dst_tile", dst_tile_info);
-        dst->init_virtual_tensor(tile, sampler);
+        writer->op_get_global_id(tile_gid_0, 0);
+        writer->op_get_global_id(tile_gid_1, 1);
+        writer->op_get_global_id(tile_gid_2, 2);
+
+        auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+        auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+        auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+        auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+        // Calculate coordinates
+        get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+                                                const_dst_shift_back_n0_i32, const_0_i32);
+        get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
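+        // (Roughly, the overlapping-min strategy above computes
+        //  cout0 = max(gid_0 * dst_n0 - shift_back, 0), so the last block overlaps the
+        //  previous one instead of stepping out of bounds.)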
+
+        // Get the boundary-aware coordinates at each global dimension index
+        if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            writer->op_assign(tile_mout1, const_0_i32);
+            get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+        }
+        else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
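+            // gid_2 linearizes HEIGHT and BATCH; recover them with mod/div by the destination height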
+            writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+            writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+        }
+        ckw::DataType src_dt   = to_ckw(_src->data_type());
+        auto          tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+
+        writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+        // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+        src->init_virtual_tensor(tile_src, sampler_src);
     }
 
-    const auto &dst_tile = dst->tile();
+    auto tile_src = src->tile();
 
-    // Check if this op is cast-down or cast-up
-    const size_t src_size  = data_size_from_type(_src->data_type());
-    const size_t dst_size  = data_size_from_type(_dst->data_type());
-    const bool   cast_down = (src_size >= dst_size);
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code (optional)
+     ********************************************************************************/
 
-    if (cast_down && is_data_type_quantized(_src->data_type()))
-    {
-        const auto &constant_x80 = writer->declare_tile("0x80", 0x80);
-        writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80);
-    }
+    // Not required
 
+    /********************************************************************************
+     * 7 - Write the rest of the code
+     ********************************************************************************/
+    // Only ConvertPolicy::None is supported for floating-point data types
     ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
 
-    if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
-    {
-        convert_policy = ckw::ConvertPolicy::Saturate;
-    }
-
-    writer->op_cast_expression(dst_tile, src_tile, convert_policy);
+    writer->op_cast(tile_dst, tile_src, convert_policy);
+    ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
 }
 
 Window GpuCkwCast::get_window() const
@@ -168,8 +243,8 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int     num_elems_processed_per_iteration =
+    constexpr uint32_t vector_size_byte_opencl = 16;
+    const uint32_t     num_elems_processed_per_iteration =
         adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
     Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
index 1e09c78..cfccab1 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,21 +24,21 @@
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
 #include <cstdint>
 #include <string>
 
-using namespace ckw;
 namespace arm_compute
 {
 namespace experimental
@@ -49,22 +49,16 @@
                                              const ArgumentPack<ITensorInfo> &tensors,
                                              const Attributes                &attributes,
                                              const Settings                  &settings)
-    : IGpuCkwComponentDriver{id, tensors},
-      _src{},
-      _weight{},
-      _bias{},
-      _dst{},
-      _attributes{attributes},
-      _settings{settings}
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
 {
-    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
     if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
     {
-        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+        _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
     }
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _bias, _dst);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst);
 }
 
 void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup    &comp_group,
@@ -72,192 +66,294 @@
                                                  GpuCkwScopedKernelWriter writer) const
 {
     // Data Layout is NHWC
-    constexpr int32_t width_idx  = 1;
-    constexpr int32_t height_idx = 2;
+    const uint32_t width_idx  = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+    const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
 
-    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-
-    // Tunable parameters
-    // Currently only m0 and n0 = 1 are supported.
-    const int32_t     m0       = root_window.y().step();
-    const int32_t     n0       = root_window.x().step();
-    constexpr int32_t m0_a_val = 1;
-    constexpr int32_t n0_a_val = 1;
-    constexpr int32_t m0_b_val = 1;
-
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *wei =
-        vtable.declare_variable(comp_group, writer, _weight, TensorStorageType::ClBufferUint8Ptr, "wei");
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
     GpuCkwComponentArgument *bia = nullptr;
 
-    if (_bias && _bias->has_valid_id())
+    const bool using_bias = _bia != nullptr;
+
+    if (using_bias)
     {
-        bia = vtable.declare_variable(comp_group, writer, _bias, TensorStorageType::ClBufferUint8Ptr, "bia");
+        bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
     }
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
-    // Constants
-    const auto &const_1    = writer->declare_tile("1", 1);
-    const auto &wei_height = writer->declare_tile("WEI_HEIGHT", static_cast<int32_t>(_weight->dimension(height_idx)));
-    const auto &wei_width  = writer->declare_tile("WEI_WIDTH", static_cast<int32_t>(_weight->dimension(width_idx)));
-    const auto &dst_height = writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
-    const auto &stride_x   = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
-    const auto &stride_y   = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
-    const auto &pad_left   = writer->declare_tile("PAD_LEFT", static_cast<int32_t>(_attributes.pad().left));
-    const auto &pad_top    = writer->declare_tile("PAD_TOP", static_cast<int32_t>(_attributes.pad().top));
-    const auto &depth_multiplier =
-        writer->declare_tile("DEPTH_MULTIPLIER", static_cast<int32_t>(_attributes.depth_multiplier()));
-    auto &const_0 = writer->declare_tile("0", 0);
-    auto &yo      = writer->declare_tile("yo", ckw::DataType::Int32);
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_dt           = to_ckw(_dst->data_type());
+    const auto kernel_height    = static_cast<int32_t>(_wei->dimension(height_idx));
+    const auto kernel_width     = static_cast<int32_t>(_wei->dimension(width_idx));
+    const auto src_w            = static_cast<int32_t>(_src->dimension(width_idx));
+    const auto src_h            = static_cast<int32_t>(_src->dimension(height_idx));
+    const auto dst_h            = static_cast<int32_t>(_dst->dimension(height_idx));
+    const auto stride_x         = static_cast<int32_t>(_attributes.stride().x());
+    const auto stride_y         = static_cast<int32_t>(_attributes.stride().y());
+    const auto pad_x            = static_cast<int32_t>(_attributes.pad().left);
+    const auto pad_y            = static_cast<int32_t>(_attributes.pad().top);
+    const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier());
+    const auto dilation_x       = static_cast<int32_t>(_attributes.dilation().x());
+    const auto dilation_y       = static_cast<int32_t>(_attributes.dilation().y());
+    const auto kernel_size      = kernel_width * kernel_height;
 
-    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    // CKW constants
+    auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+    auto const_kernel_size_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+    auto const_dst_h_i32    = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_src_w_i32    = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+    auto const_src_h_i32    = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+    auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+    auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+    auto const_pad_x_i32    = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+    auto const_pad_y_i32    = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+    auto const_0_i32        = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_neg_1_i32    = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+    auto const_depth_multiplier_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32));
+    auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32));
+    auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32));
+    auto const_0_fp           = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
 
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The compute block parameters depend on the employed tensor format
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
 
-    auto &bout = writer->declare_tile("bout", ckw::DataType::Int32);
-    writer->op_binary_expression(bout, gid_2, ckw::BinaryOp::Div, dst_height); // gid_2 / h
-    writer->op_binary_expression(yo, gid_2, ckw::BinaryOp::Mod, dst_height);   // gid_2 % h
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+    const int32_t dst_m0 = root_window.y().step();
 
-    const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0;
-    const int32_t dst_partial_m0_v = _dst->tensor_shape()[1] % m0;
-    auto         &g_ind_0          = writer->declare_tile("g_ind_0", ckw::DataType::Int32);
-    auto         &g_ind_1          = writer->declare_tile("g_ind_1", ckw::DataType::Int32);
-    get_coord(writer, g_ind_0, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0);
-    get_coord(writer, g_ind_1, gid_1, m0, dst_partial_m0_v, "dst_y_", const_0);
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
 
-    TensorTileSampler src_sampler;
-    src_sampler.width(m0_a_val);
-    src_sampler.height(n0_a_val);
-    src_sampler.format(TensorSamplerFormat::C_W_H);
-    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    src_sampler.address_mode_y(TensorSamplerAddressModeY::Skip);
-    src_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
+    // Shift-back for the overlapping-min strategy
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
 
-    TensorTileSampler wei_sampler;
-    wei_sampler.width(m0_b_val);
-    wei_sampler.height(n0);
-    wei_sampler.format(TensorSamplerFormat::C_W_H);
-    wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
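+    // Input/weights compute block sizes:
+    // - the input tile must span the kernel footprint across the dst_m0 outputs,
+    //   hence kernel_width + (dst_m0 - 1) rows
+    // - with depth_multiplier > 1, consecutive output channels do not map to consecutive
+    //   input channels, so the input is processed one channel at a time (src_n0 = 1)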
+    const int32_t src_m0 = kernel_width + (dst_m0 - 1);
+    const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0;
+    const int32_t wei_m0 = kernel_width;
+    const int32_t wei_n0 = dst_n0;
 
-    TensorTileSampler dst_sampler;
-    dst_sampler.width(n0);
-    dst_sampler.height(m0);
-    dst_sampler.format(TensorSamplerFormat::C_W_H);
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-    dst_sampler.x(g_ind_0);
-    dst_sampler.y(g_ind_1);
-    dst_sampler.z(yo);
-    dst_sampler.b(bout);
-
-    if (!dst->has_tile())
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    if (dst_n0_partial == 0)
     {
-        auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
-        dst->init_virtual_tensor(dst_tile, dst_sampler);
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
     }
-    auto &dst_tile = dst->tile();
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
 
-    writer->op_assign(dst_tile, const_0);
+    if (dst_m0_partial == 0)
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+    }
 
-    auto &xi = writer->declare_tile("xi", ckw::DataType::Int32);
-    writer->op_binary_expression(xi, g_ind_1, ckw::BinaryOp::Mul, stride_x);
-    writer->op_binary_expression(xi, xi, ckw::BinaryOp::Sub, pad_left);
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
 
-    auto &yi = writer->declare_tile("yi", ckw::DataType::Int32);
-    writer->op_binary_expression(yi, yo, ckw::BinaryOp::Mul, stride_y);
-    writer->op_binary_expression(yi, yi, ckw::BinaryOp::Sub, pad_top);
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
 
-    auto &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
-    writer->op_binary_expression(a_x, g_ind_0, BinaryOp::Div, depth_multiplier);
+    // Initialize the destination tile
+    writer->op_assign(tile_dst, const_0_fp);
 
-    // src_tile
-    auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), m0_a_val, n0_a_val));
-    // wei_tile
-    auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_weight->data_type()), m0_b_val, n0));
+    // Bind tile to the tensor
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_dst_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensors
+     ********************************************************************************/
+    // SOURCE SAMPLER
+    ckw::TensorSampler sampler_src;
+    sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero);
+    sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // WEIGHTS SAMPLER
+    // We cannot have out-of-bounds accesses for the weights
+    ckw::TensorSampler sampler_wei;
+    sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    if (_settings.export_weights_to_cl_image())
+    {
+        sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+    }
+    else
+    {
+        sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+    }
+
+    // BIAS SAMPLER
+    ckw::TensorSampler sampler_bia;
+    sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+    sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code (optional)
+     ********************************************************************************/
+    // Not required
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+    auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+    auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+                                            const_shift_back_dst_n0_i32, const_0_i32);
+    get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+    writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+    writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+    auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32);
+    writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32);
+
+    auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32);
+    writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+    writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32);
+
+    auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32);
+    writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+    writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32);
 
     // Loop variables
-    auto &yk = writer->declare_tile("yk", ckw::DataType::Int32);
-    auto &xk = writer->declare_tile("xk", ckw::DataType::Int32);
+    auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32);
 
-    // Because 1x1 blocks are being used here, scalar values are being loaded from memory instead of using tiles, since tile vector access currently is not available. Hence the values are loaded in the inner loop.
-    // This loop will be reworked.
-    writer->op_assign(yk, const_0);
-    writer->op_for_loop(yk, BinaryOp::Less, wei_height, yk, AssignmentOp::Increment, const_1,
-                        [&]()
-                        {
-                            // xk = 0
-                            writer->op_assign(xk, const_0);
-                            writer->op_for_loop(
-                                xk, BinaryOp::Less, wei_width, xk, AssignmentOp::Increment, const_1,
-                                [&]()
-                                {
-                                    writer->op_assign(b, const_0);
-                                    writer->op_assign(a, const_0);
+    writer->op_assign(tile_yk, const_0_i32);
 
-                                    // src_tile loading
-                                    auto &xi_curr = writer->declare_tile("xi_curr", ckw::DataType::Int32);
-                                    writer->op_binary_expression(xi_curr, xi, BinaryOp::Add, xk);
-                                    auto &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
-                                    writer->op_binary_expression(a_y, yi, BinaryOp::Add, yk);
-                                    src_sampler.x(a_x);
-                                    src_sampler.y(xi_curr);
-                                    src_sampler.z(a_y);
-                                    src_sampler.b(bout);
-                                    writer->op_load(a, src->tensor(), src_sampler);
-
-                                    // wei_tile loading
-                                    auto &b_y = writer->declare_tile("b_y", ckw::DataType::Int32);
-                                    writer->op_binary_expression(b_y, wei_width, BinaryOp::Mul, yk);
-                                    writer->op_binary_expression(b_y, b_y, BinaryOp::Add, xk);
-                                    wei_sampler.x(g_ind_0);
-                                    wei_sampler.y(b_y);
-                                    wei_sampler.z(const_0);
-                                    wei_sampler.b(const_0);
-                                    writer->op_load(b, wei->tensor(), wei_sampler);
-
-                                    // Do the accumulation
-                                    auto &mul_result = writer->declare_tile("mul_results", a.data_type());
-                                    writer->op_binary_expression(mul_result, a, BinaryOp::Mul, b);
-                                    writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, mul_result);
-                                });
-                        });
-
-    // Add Bias
-    if (_bias && _bias->has_valid_id())
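+    // Outer loop over the kernel rows: yk is the flattened (kernel_width x kernel_height)
+    // index and is incremented by kernel_width, i.e. one kernel row per iteration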
+    // clang-format off
+    writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32,
+    [&]()
     {
-        TensorTileSampler bias_sampler;
-        bias_sampler.width(n0);
-        bias_sampler.height(1);
-        bias_sampler.format(TensorSamplerFormat::C_W_H);
-        bias_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-        bias_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-        bias_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-        bias_sampler.x(g_ind_0);
-        bias_sampler.y(const_0);
-        bias_sampler.z(const_0);
-        bias_sampler.b(const_0);
+        auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0));
+        auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0));
 
-        auto &bias_tile = writer->declare_tile("bias_tile", ckw::TileInfo(to_ckw(_bias->data_type()), 1, n0));
-        writer->op_load(bias_tile, bia->tensor(), bias_sampler);
-        writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, bias_tile);
+        writer->op_assign(tile_src, const_0_fp);
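+        // Zero-initialize the input tile: rows whose index is forced to -1 below are skipped
+        // by the SkipLessThanZero address mode and therefore remain zero (implicit padding)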
+
+        auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_x_lt_w  = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_y_lt_h  = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32));
+
+        // Check if yi + yk * DILATION_Y is out-of-bounds
+        writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32);
+        writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32);
+
+        auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32));
+
+        // Load src
+        for(int32_t xk = 0; xk < src_m0; ++xk)
+        {
+            auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32));
+
+            // xi + xk * DILATION_X
+            writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32);
+            writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi);
+
+            // Check if xi + xk * DILATION_X is out-of-bounds
+            writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32);
+            writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32);
+
+            // Set mi to -1 if we have out-of-bounds memory accesses
+            writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0);
+            writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w);
+            writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0);
+            writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h);
+
+            writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0);
+        }
+
+        // Load wei
+        writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32);
+
+        // ATTENTION: the MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW.
+        // Therefore, this part should be replaced with the MAC ternary operator once it becomes available.
+        auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
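+        // Output row m0 of the block consumes input rows m0 .. m0 + kernel_width - 1 of tile_src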
+        for(int32_t m0 = 0; m0 < dst_m0; ++m0)
+        {
+            for(int32_t xk = 0; xk < kernel_width; ++xk)
+            {
+                auto tile_a = tile_src.row(m0 + xk);
+                auto tile_b = tile_wei.row(xk);
+                auto tile_c = tile_dst.row(m0);
+
+                writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b);
+                writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp);
+            }
+        }
+        writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32);
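+        // Advance the input y coordinate by DILATION_Y for the next kernel row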
+    });
+    // clang-format on
+
+    // Bias addition
+    // NOTE: This operation will be removed from this kernel once the interface is standardized. The intended way of
+    // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+    if (using_bias)
+    {
+        if (!bia->has_tile())
+        {
+            auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+            writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32);
+            bia->init_virtual_tensor(tile_bia, sampler_bia);
+        }
+        auto &tile_bia = bia->tile();
+
+        writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
     }
+
+    ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
 }
 
 Window GpuCkwDepthwiseConv2d::get_window() const
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
     TensorShape output_shape = _dst->tensor_shape();
-    // Currently only m0 and n0 = 1 are supported.
-    Window win = calculate_max_window(output_shape, Steps(1U, 1U));
+
+    Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0()));
     return win.collapse(win, Window::DimZ);
 }
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
index f9bcaab..a15d3ee 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "src/core/common/Macros.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+
 namespace arm_compute
 {
 namespace experimental
@@ -67,8 +68,8 @@
 
 private:
     const ITensorInfo *_src;
-    const ITensorInfo *_weight;
-    const ITensorInfo *_bias;
+    const ITensorInfo *_wei;
+    const ITensorInfo *_bia;
     const ITensorInfo *_dst;
     Attributes         _attributes;
     Settings           _settings;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
index 7833da2..eb4f644 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,19 +26,18 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileInfo.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+#include <vector>
 
 namespace arm_compute
 {
@@ -47,7 +46,7 @@
 namespace dynamic_fusion
 {
 
-using TileContainer = std::vector<std::vector<std::string>>;
+using TileContainer = std::vector<std::vector<int32_t>>;
 
 GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId                      id,
                                        const ArgumentPack<ITensorInfo> &tensors,
@@ -70,20 +69,126 @@
     ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
                              "Only the weights tensor can be exported to cl_image");
 
-    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
-    const unsigned int width_idx   = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+    const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const uint32_t width_idx   = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+    const uint32_t height_idx  = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
 
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *bia = nullptr;
+
+    const bool using_bias = _bia != nullptr;
+
+    if (using_bias)
+    {
+        bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+    }
+
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_dt        = to_ckw(_dst->data_type());
+    const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+    const auto kernel_width  = static_cast<int32_t>(_wei->dimension(width_idx));
+    const auto src_c         = static_cast<int32_t>(_src->dimension(channel_idx));
+    const auto src_w         = static_cast<int32_t>(_src->dimension(width_idx));
+    const auto src_h         = static_cast<int32_t>(_src->dimension(height_idx));
+    const auto dst_w         = static_cast<int32_t>(_dst->dimension(width_idx));
+    const auto stride_x      = static_cast<int32_t>(_attributes.stride().x());
+    const auto stride_y      = static_cast<int32_t>(_attributes.stride().y());
+    const auto pad_x         = static_cast<int32_t>(_attributes.pad().left);
+    const auto pad_y         = static_cast<int32_t>(_attributes.pad().top);
+    const auto kernel_size   = kernel_width * kernel_height;
+    const auto k0 =
+        static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)));
+
+    // CKW constants
+    auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+    auto const_kernel_size_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+    auto const_src_c_i32    = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32));
+    auto const_src_w_i32    = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+    auto const_src_h_i32    = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+    auto const_dst_w_i32    = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32));
+    auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+    auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+    auto const_pad_x_i32    = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+    auto const_pad_y_i32    = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+    auto const_k0_i32       = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+    auto const_0_i32        = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_pos_1_i32    = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_neg_1_i32    = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+    auto const_0_fp         = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_src_c_i32_minus_k0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The compute block parameters depend on the employed tensor format
     const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
 
-    // Tunable parameters
-    const int32_t m0         = root_window.y().step();
-    const int32_t n0         = root_window.x().step();
-    const int32_t k0         = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
-    const int32_t partial_n0 = _dst->dimension(0) % n0;
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+    const int32_t dst_m0 = root_window.y().step();
 
-    const int32_t K = _src->dimension(channel_idx);
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
 
+    // Shift-back for the overlapping-min strategy
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+    if (dst_n0_partial == 0)
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
+
+    if (dst_m0_partial == 0)
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+    }
+
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+    // Initialize destination tile
+    writer->op_assign(tile_dst, const_0_fp);
+
+    // Bind tile to the tensor
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_dst_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the samplers for the input tensors
+     ********************************************************************************/
     // Exporting the weights tensor to an OpenCL image object is currently only supported when:
     //   a) k0 is equal to 4
     // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
@@ -92,143 +197,123 @@
     // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
     // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
     // extra loop to compute the left-over elements.
-    const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
+    const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0);
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *wei = vtable.declare_variable(
-        comp_group, writer, _wei,
-        use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-    GpuCkwComponentArgument *bia = nullptr;
+    // SOURCE SAMPLER
+    // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+    //   compute left-over elements
+    // - We cannot have out-of-bounds reads when the kernel height is equal to 1. In all other cases, we need to ensure the
+    //   indirection buffer mi does not contain negative values representing out-of-bounds reads.
+    auto address_mode_y_src =
+        kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero;
+    ckw::TensorSampler sampler_src;
+    sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+    sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_src.address_mode_y(address_mode_y_src);
+    sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
 
-    const bool using_bias = _bia != nullptr;
+    // WEIGHTS SAMPLER
+    // We cannot have out-of-bounds accesses for the weights
+    ckw::TensorSampler sampler_wei;
+    sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+    sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    if (use_cl_image_for_weights)
+    {
+        sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+    }
+    else
+    {
+        sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+    }
+
+    // BIAS SAMPLER
+    ckw::TensorSampler sampler_bia;
 
     if (using_bias)
     {
-        bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
+        sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+        sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+        sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+        sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+        sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
     }
 
-    // Constants
-    const auto kernel_height    = static_cast<int32_t>(_wei->dimension(height_idx));
-    const auto kernel_width     = static_cast<int32_t>(_wei->dimension(width_idx));
-    const auto src_channels     = static_cast<int32_t>(_src->dimension(channel_idx));
-    auto      &tile_kernel_w    = writer->declare_tile("kernel_w", kernel_width);
-    auto      &tile_kernel_size = writer->declare_tile("kernel_size", kernel_width * kernel_height);
-    auto      &tile_src_c       = writer->declare_tile("src_c", static_cast<int32_t>(_src->dimension(channel_idx)));
-    auto      &tile_dst_w       = writer->declare_tile("dst_w", static_cast<int32_t>(_dst->dimension(width_idx)));
-    auto      &tile_stride_x    = writer->declare_tile("stride_x", static_cast<int32_t>(_attributes.stride().x()));
-    auto      &tile_stride_y    = writer->declare_tile("stride_y", static_cast<int32_t>(_attributes.stride().y()));
-    auto      &tile_pad_x       = writer->declare_tile("pad_x", static_cast<int32_t>(_attributes.pad().left));
-    auto      &tile_pad_y       = writer->declare_tile("pad_y", static_cast<int32_t>(_attributes.pad().top));
-    auto      &tile_k0          = writer->declare_tile("k0", k0);
-    auto      &tile_0           = writer->declare_tile("0", 0);
-    auto      &tile_1           = writer->declare_tile("1", 1);
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code (optional)
+     ********************************************************************************/
 
-    auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    // Not required
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
 
     writer->op_get_global_id(tile_gid_0, 0);
     writer->op_get_global_id(tile_gid_1, 1);
     writer->op_get_global_id(tile_gid_2, 2);
 
-    auto &tile_cout = writer->declare_tile("cout", ckw::DataType::Int32); // OFM
-    auto &tile_mout = writer->declare_tile("mout", ckw::DataType::Int32); // WIDTH x HEIGHT
-    auto &tile_bout = writer->declare_tile("bout", ckw::DataType::Int32); // BATCH SIZE IDX
+    auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT
+    auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
 
-    // Get the boundary aware coordinates at each global dimension index
-    get_coord(writer, tile_cout, tile_gid_0, n0, partial_n0, tile_cout.name() + "_dim0_", tile_0);
-    get_coord(writer, tile_mout, tile_gid_1, m0, 0, tile_mout.name() + "_dim1_", tile_0);
-    get_coord(writer, tile_bout, tile_gid_2, 1, 0, tile_bout.name() + "_dim2_", tile_0);
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32,
+                                            const_shift_back_dst_n0_i32, const_0_i32);
+    get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32);
+    get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32);
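+    // mout indexes the collapsed WIDTH x HEIGHT dimension (Dim0_Dim1xDim2_1 format),
+    // while bout advances by 1 per work-item along gid_2 (the batch index)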
 
-    TensorTileSampler src_sampler;
-    src_sampler.width(k0);
-    src_sampler.height(m0);
-    src_sampler.format(TensorSamplerFormat::C_WH_1);
-    // We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
-    // compute left-over elements
-    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
-    // indirection buffer mi does not contain negative values representing out-of-bounds reads.
-    src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None
-                                                  : TensorSamplerAddressModeY::SkipMinEdgeOnly);
-    src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    TensorTileSampler wei_sampler;
-    wei_sampler.width(k0);
-    wei_sampler.height(n0);
-    wei_sampler.format(TensorSamplerFormat::C_WH_1);
-    // We cannot have out-of-bounds accesses for the weights
-    wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    TensorTileSampler dst_sampler;
-    dst_sampler.width(n0);
-    dst_sampler.height(m0);
-    dst_sampler.format(TensorSamplerFormat::C_WH_1);
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::ClampToMaxEdgeOnly);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-    dst_sampler.x(tile_cout);
-    dst_sampler.y(tile_mout);
-    dst_sampler.z(tile_0);
-    dst_sampler.b(tile_bout);
-
-    if (!dst->has_tile())
-    {
-        auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
-        dst->init_virtual_tensor(tile, dst_sampler);
-    }
-    auto &tile_dst = dst->tile();
-
-    writer->op_assign(tile_dst, tile_0);
-
-    // We create a 2d container of size (M0, 1) to store the indices for iteration
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
+    // We create a 2d container of size (dst_m0, 1) to store the indices for iteration
     TileContainer it;
-    for (int m = 0; m < m0; ++m)
+    for (int32_t m = 0; m < dst_m0; ++m)
     {
-        std::vector<std::string> idx{std::to_string(m)};
+        std::vector<int32_t> idx{m};
         it.push_back({idx});
     }
-    const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
 
-    auto &tile_xi = writer->declare_tile("xi", TileInfo(ckw::DataType::Int32, m0, 1));
-    auto &tile_yi = writer->declare_tile("yi", TileInfo(ckw::DataType::Int32, m0, 1));
+    const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32));
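+    // const_idxs holds the column vector {0, 1, ..., dst_m0 - 1}; adding it to the scalar
+    // mout vectorizes the linear-index-to-coordinate conversion below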
+
+    auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+    auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
 
     // Convert the linear index to coordinate
     // xi = ((mout + i) % dst_w) * stride_x - pad_x
     // yi = ((mout + i) / dst_w) * stride_y - pad_y
-    writer->op_binary_expression(tile_xi, tile_mout, BinaryOp::Add, tile_it);
-    writer->op_binary_expression(tile_yi, tile_mout, BinaryOp::Add, tile_it);
-    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mod, tile_dst_w);
-    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Div, tile_dst_w);
-    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mul, tile_stride_x);
-    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Mul, tile_stride_y);
-    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Sub, tile_pad_x);
-    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Sub, tile_pad_y);
+    writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+    writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+    writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32);
+    writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32);
+    writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32);
+    writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32);
+    writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32);
+    writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32);
 
-    auto &tile_y_b = writer->declare_tile("y_b", ckw::DataType::Int32);
-    writer->op_binary_expression(tile_y_b, tile_cout, BinaryOp::Mul, tile_kernel_size);
+    auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32));
+    writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32);
 
-    auto &tile_i = writer->declare_tile("i", ckw::DataType::Int32);
-    writer->op_assign(tile_i, tile_0);
+    auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32));
+    writer->op_assign(tile_i, const_0_i32);
 
     // clang-format off
-    writer->op_for_loop(tile_i, BinaryOp::Less, tile_kernel_size, tile_i, AssignmentOp::Increment, tile_1, [&]()
+    writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
     {
-        auto &tile_x_k = writer->declare_tile("x_k", ckw::DataType::Int32);
-        auto &tile_y_k = writer->declare_tile("y_k", ckw::DataType::Int32);
+        auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32));
 
-        writer->op_binary_expression(tile_x_k, tile_i, BinaryOp::Mod, tile_kernel_w);
-        writer->op_binary_expression(tile_y_k, tile_i, BinaryOp::Div, tile_kernel_w);
+        writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32);
+        writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32);
 
-        auto &tile_ck = writer->declare_tile("ck", ckw::DataType::Int32);
-        writer->op_assign(tile_ck, tile_0);
+        auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32));
+        writer->op_assign(tile_ck, const_0_i32);
 
-        auto &tile_mi = writer->declare_tile("mi", TileInfo(ckw::DataType::Int32, m0, 1));
         // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
         // x_s = xi + x_k
         // y_s = yi + y_k
@@ -237,68 +322,78 @@
         // mi = select(-1, mi, x_s < width);
         // mi = select(-1, mi, y_s >= 0);
         // mi = select(-1, mi, y_s < height);
-        writer->util_get_indirect_buffer(tile_mi, src->tensor(), src_sampler, tile_xi, tile_yi, tile_x_k, tile_y_k);
+        auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+        auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+        auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
 
-        src_sampler.x(tile_ck);
-        src_sampler.y(tile_mi);
-        src_sampler.z(tile_0);
-        src_sampler.b(tile_bout);
+        auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+        auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+        auto tile_xs_lt_w  = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+        auto tile_ys_lt_h  = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
 
-        wei_sampler.x(tile_ck);
-        wei_sampler.y(tile_y_b);
-        wei_sampler.z(tile_0);
-        wei_sampler.b(tile_0);
+        writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k);
+        writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k);
+        writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32);
+        writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs);
+        writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32);
+        writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32);
+        writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32);
+        writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32);
+        writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0);
+        writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0);
+        writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w);
+        writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h);
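+        // Illustrative walk-through: an in-bounds row keeps mi = ys * src_w + xs,
+        // the linearized W x H offset; a padding row (e.g. xs = -1) fails xs_gte_0
+        // and mi becomes -1. The indirect load below is expected to treat the -1
+        // rows as out-of-bounds, so the zero-initialized lhs rows stand in for the
+        // padded values.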
 
-        auto &tile_src_c_minus_k0 = writer->declare_tile("src_c_minus_k0", src_channels - k0);
-
-        writer->op_for_loop(tile_ck, BinaryOp::LessEqual, tile_src_c_minus_k0, tile_ck, AssignmentOp::Increment, tile_k0, [&]()
+        writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]()
         {
-            auto &tile_lhs = writer->declare_tile("lhs", TileInfo(to_ckw(_src->data_type()), m0, k0));
-            auto &tile_rhs = writer->declare_tile("rhs", TileInfo(to_ckw(_wei->data_type()), n0, k0));
-            writer->op_assign(tile_lhs, tile_0);
-            writer->op_assign(tile_rhs, tile_0);
+            auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0));
+            auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0));
+            writer->op_assign(tile_lhs, const_0_fp);
+            writer->op_assign(tile_rhs, const_0_fp);
 
-            writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
-            writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+            writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+            writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
 
-            writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+            writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
         });
 
         // Left-over accumulations for when K is not a multiple of k0
-        if(!(K % k0 == 0))
+        if ((src_c % k0) != 0)
         {
-            writer->op_for_loop(tile_ck, BinaryOp::Less, tile_src_c, tile_ck, AssignmentOp::Increment, tile_1, [&]()
+            writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
             {
-                auto &tile_lhs = writer->declare_tile("lhs_leftover", TileInfo(to_ckw(_src->data_type()), m0, 1));
-                auto &tile_rhs = writer->declare_tile("rhs_leftover", TileInfo(to_ckw(_wei->data_type()), n0, 1));
-                writer->op_assign(tile_lhs, tile_0);
-                writer->op_assign(tile_rhs, tile_0);
+                auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1));
+                auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1));
+                writer->op_assign(tile_lhs, const_0_fp);
+                writer->op_assign(tile_rhs, const_0_fp);
 
-                writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
-                writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+                writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+                writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
 
-                writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+                writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
             });
         }
 
-    writer->op_binary_expression(tile_y_b, tile_y_b, BinaryOp::Add, tile_1);
+        writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32);
     });
     // clang-format on
 
-    // Bias addition
-    // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+    // NOTE: The bias addition will be removed from this kernel once the interface is standardized. The intended way of
     // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
     if (using_bias)
     {
         if (!bia->has_tile())
         {
-            // Reuse the destination sampler for the bias
-            writer->op_load_once(bia, dst_sampler);
+            auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+            writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32);
+            bia->init_virtual_tensor(tile_bia, sampler_bia);
         }
         auto &tile_bia = bia->tile();
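+        // tile_bia has shape (1, dst_n0): the addition below assumes op_binary
+        // broadcasts the single bias row across the dst_m0 rows of tile_dst.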
 
-        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_bia);
+        writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
     }
+
+    ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
 }
 
 Window GpuCkwDirectConv2d::get_window() const
@@ -308,13 +403,13 @@
     const auto dst_shape = _dst->tensor_shape();
     const auto desc      = _settings.direct_conv_descriptor();
 
-    const unsigned int n0 = adjust_vec_size(desc.n0, dst_shape[0]);
-    const unsigned int m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+    const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+    const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
 
-    Window win = calculate_max_window(dst_shape, Steps(n0, m0));
+    Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0));
 
-    const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], m0);
-    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
+    const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0);
+    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0));
     win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
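+    // Example: with dst W x H = 7 x 7 = 49 and dst_m0 = 4, dim_y_collapsed = 52,
+    // so DimY steps through 0, 4, ..., 48 and the destination sampler's boundary
+    // handling absorbs the 3-element overshoot of the last step.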
 
     return win;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
index ac32d2d..139cf62 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,14 +22,15 @@
  * SOFTWARE.
  */
 
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
 
 #include "src/core/common/Macros.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
 namespace arm_compute
 {
@@ -37,7 +38,7 @@
 {
 namespace dynamic_fusion
 {
-class GpuCkwDirectConv2d final : public IGpuCkwComponentDriver
+class GpuCkwDirectConv2d : public IGpuCkwComponentDriver
 {
 public:
     using Attributes = ClComponentDirectConv2d::Attributes;
@@ -57,9 +58,7 @@
                        const ArgumentPack<ITensorInfo> &tensors,
                        const Attributes                &attributes,
                        const Settings                  &settings);
-
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
-
     /** Destructor */
     ~GpuCkwDirectConv2d() override = default;
 
@@ -76,11 +75,11 @@
     const ITensorInfo *_bia;
     const ITensorInfo *_dst;
 
-    const Attributes _attributes;
-    const Settings   _settings;
+    Attributes _attributes;
+    Settings   _settings;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index 2935ba4..fb55aca 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,11 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/types/TensorSamplerTypes.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
@@ -42,10 +39,12 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
-#include <algorithm>
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/ConstantData.h"
+#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h"
+#include <cstdint>
 #include <string>
 
-using namespace ckw;
 namespace arm_compute
 {
 namespace experimental
@@ -67,67 +66,339 @@
                                                    GpuCkwVariableTable     &vtable,
                                                    GpuCkwScopedKernelWriter writer) const
 {
-    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const auto n0          = static_cast<int32_t>(root_window.x().step());
-    const auto m0          = static_cast<int32_t>(root_window.y().step());
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    GpuCkwComponentArgument *lhs =
-        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
-    GpuCkwComponentArgument *rhs =
-        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
 
-    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    // CKW constants
+    auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_i32     = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
 
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The compute block parameters depend on the employed tensor format
 
-    auto &const_0 = writer->declare_tile("0", 0);
+    // Destination compute block size
+    int32_t dst_n0 = -1;
+    int32_t dst_m0 = -1;
 
-    // Load the LHS and RHS tiles
-    if (!lhs->has_tile())
-    {
-        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1),
-                                                        n0, m0, "lhs_", const_0);
-        sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
-        sampler.z(const_0);
-        sampler.b(gid_2);
-        writer->op_load_once(lhs, sampler);
-    }
-    if (!rhs->has_tile())
-    {
-        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1),
-                                                        n0, m0, "rhs_", const_0);
-        sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
-        sampler.z(const_0);
-        sampler.b(gid_2);
-        writer->op_load_once(rhs, sampler);
-    }
+    // Destination compute block size left-over
+    int32_t dst_n0_partial = -1;
+    int32_t dst_m0_partial = -1;
 
-    auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1),
-                                                        n0, m0, "dst_", const_0);
-    dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
-    dst_sampler.z(const_0);
-    dst_sampler.b(gid_2);
-
-    // Prepare the output tile.
     if (!dst->has_tile())
     {
-        auto &tile = writer->declare_tile(
-            "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
-        dst->init_virtual_tensor(tile, dst_sampler);
+        // If this is the root component, use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+        // as the tensor format
+        const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+        dst_n0         = root_window.x().step();
+        dst_m0         = root_window.y().step();
+        dst_n0_partial = _dst->dimension(0) % dst_n0;
+        dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+        ckw::TensorSampler sampler_dst;
+        sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+        if (dst_n0_partial == 0)
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+        }
+
+        if (dst_m0_partial == 0)
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+        }
+        else
+        {
+            sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+        }
+        sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+        sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
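+        // Example: a dst dimension 0 of 24 with dst_n0 = 8 gives dst_n0_partial = 0,
+        // so no boundary handling is emitted on X (None); a dimension 0 of 30 gives
+        // dst_n0_partial = 6, and OverlappingMin shifts the final block back so that
+        // its accesses stay in bounds (the overlapped columns are simply recomputed).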
+
+        // Declare destination tile
+        ckw::DataType dst_dt   = to_ckw(_dst->data_type());
+        auto          tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+        // Bind tile to the tensor
+        dst->init_virtual_tensor(tile_dst, sampler_dst);
+    }
+    else
+    {
+        // If this is not the root component, take dst_n0 and dst_m0 from the existing destination tile
+        dst_n0 = dst->tile().tile_info().width();
+        dst_m0 = dst->tile().tile_info().height();
+
+        // Calculating dst_n0_partial and dst_m0_partial is not required here: reaching
+        // this branch means the element-wise operation is not the root component, so
+        // the address modes have already been set.
     }
 
-    auto &lhs_tile = lhs->tile();
-    auto &rhs_tile = rhs->tile();
-    auto &dst_tile = dst->tile();
+    const auto &tile_dst = dst->tile();
 
-    // Perform the operation.
-    writer->op_binary_expression(dst_tile, lhs_tile, to_ckw(_attributes), rhs_tile);
+    /********************************************************************************
+     * 4 - Define the CKW constants for the compute block parameters
+     ********************************************************************************/
+    // Not required here: the compute block constants are declared per input tensor in section 5
+
+    /********************************************************************************
+     * 5 - Define the samplers for the input tensors
+     ********************************************************************************/
+    // Check whether the lhs tensor already has an associated tile.
+    // If not, create a sampler and load the tensor content into a tile
+    if (!lhs->has_tile())
+    {
+        // Sampler
+        ckw::TensorSampler sampler_lhs = dst->tensor_sampler();
+
+        bool broadcast_x = false;
+        bool broadcast_y = false;
+
+        int32_t lhs_n0 = dst_n0;
+        int32_t lhs_m0 = dst_m0;
+
+        // Check whether we have broadcasting
+        // In case of broadcast, lhs can only be a vector or scalar.
+        // Broadcasting in other dimensions is not supported
+        if (_dst->dimension(0) != _lhs->dimension(0))
+        {
+            broadcast_x = true;
+            lhs_n0      = 1;
+        }
+
+        if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2))
+            {
+                broadcast_y = true;
+                lhs_m0      = 1;
+            }
+        }
+        else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            if (_dst->dimension(1) != _lhs->dimension(1))
+            {
+                broadcast_y = true;
+                lhs_m0      = 1;
+            }
+        }
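+        // Example: for a dst of shape (C = 32, W = 8, H = 8), an lhs of shape
+        // (32, 1, 1) broadcasts along Y (lhs_m0 = 1, one value per channel), while
+        // an lhs of shape (1, 8, 8) broadcasts along X (lhs_n0 = 1, one value per
+        // spatial position).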
+
+        const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0;
+        const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0;
+
+        // Constants
+        auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32));
+        auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32));
+        auto const_lhs_shift_back_n0_i32 =
+            writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32));
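+        // Example: an lhs dimension 0 of 30 with lhs_n0 = 4 gives lhs_partial_n0 = 2
+        // and lhs_shift_back = 2, so the last block starts at column 26 instead of 28
+        // and all of its 4 accesses (columns 26..29) stay in bounds.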
+
+        auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32));
+
+        writer->op_get_global_id(tile_gid_0, 0);
+        writer->op_get_global_id(tile_gid_1, 1);
+        writer->op_get_global_id(tile_gid_2, 2);
+
+        auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+        auto tile_mout0 =
+            writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+        auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+        auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+        // Calculate coordinates
+        if (!broadcast_x)
+        {
+            get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32,
+                                                    const_lhs_shift_back_n0_i32, const_0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_cout0, const_0_i32);
+        }
+
+        if (!broadcast_y)
+        {
+            get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_mout0, const_0_i32);
+        }
+
+        // Get the boundary aware coordinates at each global dimension index
+        if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            writer->op_assign(tile_mout1, const_0_i32);
+            get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+        }
+        else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            // For tile_mout1 and tile_bout0 the step can only be 1
+            if (!broadcast_y)
+            {
+                writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+            }
+            else
+            {
+                // If broadcast_y == true, it means that we have either a scalar or vector
+                // because broadcasting in other dimensions is not supported
+                writer->op_assign(tile_mout1, const_0_i32);
+            }
+
+            writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+        }
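+        // In the Dim0_Dim1_Dim2 case above, gid_2 enumerates HEIGHT x BATCH: with
+        // dst_h = 16, a gid_2 of 35 decomposes into mout1 = 35 % 16 = 3 (y index)
+        // and bout0 = 35 / 16 = 2 (batch index).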
+
+        ckw::DataType lhs_dt   = to_ckw(_lhs->data_type());
+        auto          tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0));
+
+        writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+        // Here, init_virtual_tensor() is used to bring tile_lhs outside the compound statement
+        lhs->init_virtual_tensor(tile_lhs, sampler_lhs);
+    }
+
+    // Check whether the rhs tensor already has an associated tile.
+    // If not, create a sampler and load the tensor content into a tile
+    if (!rhs->has_tile())
+    {
+        // Sampler
+        ckw::TensorSampler sampler_rhs = dst->tensor_sampler();
+
+        bool broadcast_x = false;
+        bool broadcast_y = false;
+
+        int32_t rhs_n0 = dst_n0;
+        int32_t rhs_m0 = dst_m0;
+
+        // Check whether we have broadcasting
+        // In case of broadcast, rhs can only be a vector or scalar.
+        // Broadcasting in other dimensions is not supported
+        if (_dst->dimension(0) != _rhs->dimension(0))
+        {
+            broadcast_x = true;
+            rhs_n0      = 1;
+        }
+
+        if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2))
+            {
+                broadcast_y = true;
+                rhs_m0      = 1;
+            }
+        }
+        else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            if (_dst->dimension(1) != _rhs->dimension(1))
+            {
+                broadcast_y = true;
+                rhs_m0      = 1;
+            }
+        }
+
+        const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0;
+        const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0;
+
+        // Constants
+        auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32));
+        auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32));
+        auto const_rhs_shift_back_n0_i32 =
+            writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32));
+
+        auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32));
+        auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32));
+
+        writer->op_get_global_id(tile_gid_0, 0);
+        writer->op_get_global_id(tile_gid_1, 1);
+        writer->op_get_global_id(tile_gid_2, 2);
+
+        auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+        auto tile_mout0 =
+            writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+        auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+        auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+        // Calculate coordinates
+        if (!broadcast_x)
+        {
+            get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+                                                    const_rhs_shift_back_n0_i32, const_0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_cout0, const_0_i32);
+        }
+
+        if (!broadcast_y)
+        {
+            get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_mout0, const_0_i32);
+        }
+
+        // Get the boundary aware coordinates at each global dimension index
+        if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            writer->op_assign(tile_mout1, const_0_i32);
+            get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+        }
+        else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            // For tile_mout1 and tile_bout0 the step can only be 1
+            if (!broadcast_y)
+            {
+                writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+            }
+            else
+            {
+                // If broadcast_y == true, it means that we have either a scalar or vector
+                // because broadcasting in other dimensions is not supported
+                writer->op_assign(tile_mout1, const_0_i32);
+            }
+
+            writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+        }
+
+        ckw::DataType rhs_dt   = to_ckw(_rhs->data_type());
+        auto          tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0));
+
+        writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+        // Here, init_virtual_tensor() is used to bring tile_rhs outside the compound statement
+        rhs->init_virtual_tensor(tile_rhs, sampler_rhs);
+    }
+
+    const auto &tile_lhs = lhs->tile();
+    const auto &tile_rhs = rhs->tile();
+
+    /********************************************************************************
+     * 7 - Write the rest of the code
+     ********************************************************************************/
+    // Perform the element-wise operation
+    writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs);
+
+    ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
 }
 
 Window GpuCkwElementwiseBinary::get_window() const
@@ -138,8 +409,8 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int     num_elems_processed_per_iteration =
+    constexpr uint32_t vector_size_byte_opencl = 16;
+    const uint32_t     num_elems_processed_per_iteration =
         adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
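+    // Example: for F16 (element size 2 bytes), 16 / 2 = 8 elements are processed per
+    // iteration; adjust_vec_size() caps this at the size of dimension 0.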
     Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
@@ -158,24 +429,6 @@
     };
     return join(build_params, "_");
 }
-
-std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-    /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles)
-    std::vector<std::string> build_params = {
-        "elementwise_binary",
-        "op",
-        to_string(_attributes.operation()),
-        "dt",
-        lower_string(string_from_data_type(_dst->data_type())),
-        "dst_dim0",
-        support::cpp11::to_string(_dst->dimension(0)),
-        "dst_dim1",
-        support::cpp11::to_string(_dst->dimension(1)),
-    };
-    return join(build_params, "_");
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
index 1a20d4c..c6cbba2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
 
 #include "src/core/common/Macros.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
@@ -56,7 +56,6 @@
                                       GpuCkwScopedKernelWriter writer) const override;
     Window       get_window() const override;
     std::string  get_name(const ComponentGroup &comp_group) const override;
-    std::string  get_tuner_id(const ComponentGroup &comp_group) const override;
 
 private:
     const ITensorInfo *_lhs;
@@ -68,4 +67,4 @@
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
index 9beba03..14ad384 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,18 +24,20 @@
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
-using namespace ckw;
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
 namespace arm_compute
 {
 namespace experimental
@@ -59,189 +61,189 @@
                                         GpuCkwVariableTable     &vtable,
                                         GpuCkwScopedKernelWriter writer) const
 {
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto k =
+        _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0));
+    const auto k0     = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k));
+    const auto dst_dt = to_ckw(_dst->data_type());
+
+    // CKW constants
+    auto const_k_i32          = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32));
+    auto const_k0_i32         = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+    auto const_0_i32          = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_pos_1_i32      = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_fp           = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The n0 and m0 parameters from root_window refer only to the output
     const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
 
-    GpuCkwComponentArgument *lhs =
-        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
-    GpuCkwComponentArgument *rhs =
-        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+    const int32_t dst_m0 = root_window.y().step();
 
-    // Constants
-    const int   height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
-    const auto &rhs_h      = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
-    const int   m          = static_cast<int>(_dst->dimension(1));
-    const int   n          = static_cast<int>(_dst->dimension(0));
-    const int   k =
-        _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x());
-    const int m0               = root_window.y().step();
-    const int n0               = root_window.x().step();
-    const int k0               = _settings.k0();
-    const int partial_store_m0 = m % m0;
-    const int partial_store_n0 = n % n0;
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
 
-    const auto &const_1 = writer->declare_tile("1", 1);
-    auto       &const_0 = writer->declare_tile("0", 0);
-    auto       &k0_tile = writer->declare_tile("k0", k0);
-    auto       &k_tile  = writer->declare_tile("k", k);
+    // Shift-back for the overlapping-min strategy
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
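+    // Example: N = 30 with dst_n0 = 8 gives dst_n0_partial = 6 and dst_shift_back = 2,
+    // so with OverlappingMin the last block starts two columns early and recomputes
+    // the overlapped columns instead of accessing out of bounds.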
 
-    auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
-
-    auto &x = writer->declare_tile("x", ckw::DataType::Int32);
-    auto &y = writer->declare_tile("y", ckw::DataType::Int32);
-    auto &z = writer->declare_tile("z", ckw::DataType::Int32);
-
-    get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
-    get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
-    get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);
-
-    TensorTileSampler lhs_sampler;
-    lhs_sampler.height(m0);
-    lhs_sampler.width(k0);
-    lhs_sampler.format(TensorSamplerFormat::C_W_H);
-    lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    TensorTileSampler rhs_sampler;
-    rhs_sampler.height(k0);
-    rhs_sampler.width(n0);
-    rhs_sampler.format(TensorSamplerFormat::C_WH_1);
-    rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    TensorTileSampler dst_sampler;
-    dst_sampler.width(n0);
-    dst_sampler.height(m0);
-    dst_sampler.format(TensorSamplerFormat::C_W_H);
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-    dst_sampler.x(x);
-    dst_sampler.y(y);
-    dst_sampler.z(z);
-    dst_sampler.b(const_0);
-
-    if (!dst->has_tile())
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    if (dst_n0_partial == 0)
     {
-        auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
-        dst->init_virtual_tensor(dst_tile, dst_sampler);
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
     }
-    auto &dst_tile = dst->tile();
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
 
-    // Initialize the accumulators
-    writer->op_assign(dst_tile, const_0);
+    if (dst_m0_partial == 0)
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+    }
 
-    auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
-    writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
 
-    auto &k_i     = writer->declare_tile("k_i", ckw::DataType::Int32);
-    auto &k_limit = writer->declare_tile("k_limit", k - k0);
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
 
-    auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
-    writer->op_assign(x_i, const_0);
+    // Initialize destination tile
+    writer->op_assign(tile_dst, const_0_fp);
 
-    writer->op_assign(k_i, const_0);
+    // Bind tile to the tensor
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
 
-    // *INDENT-OFF*
+    /********************************************************************************
+     * 4 - Define the CKW constants for the compute block parameters
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_dst_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the samplers for the input tensors
+     ********************************************************************************/
+    // LHS SAMPLER
+    // The assumption here is that M is a multiple of M0. This limitation will be removed once
+    // OverlappingMin is supported as the address mode for the Y direction
+    ckw::TensorSampler sampler_lhs;
+    sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // RHS SAMPLER
+    ckw::TensorSampler sampler_rhs;
+    sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code (optional)
+     ********************************************************************************/
+
+    // Not required
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index
+    auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index
+    auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index
+
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32,
+                                            const_shift_back_dst_n0_i32, const_0_i32);
+    get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32);
+    get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32);
+
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
+    auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index
+
+    writer->op_assign(tile_idx_k, const_0_i32);
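+    // Example: K = 19 with k0 = 4 runs the main loop below for idx_k = 0, 4, 8, 12
+    // (while idx_k <= k - k0 = 15); the left-over loop then covers idx_k = 16..18
+    // one element at a time.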
+
     // clang-format off
-    writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
-        [&]()
-        {
-            //Initialize tiles
-            // lhs_tile
-            auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
-            // rhs_tile
-            auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
-            writer->op_assign(a, const_0);
-            writer->op_assign(b, const_0);
-
-            // Loading the tiles
-            // LHS
-            lhs_sampler.x(x_i);
-            lhs_sampler.y(y);
-            lhs_sampler.z(z);
-            lhs_sampler.b(const_0);
-            writer->op_load(a, lhs->tensor(), lhs_sampler);
-
-            // RHS
-            auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
-            writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
-            rhs_sampler.x(k_i);
-            rhs_sampler.y(y_i);
-            rhs_sampler.z(const_0);
-            rhs_sampler.b(const_0);
-            writer->op_load(b, rhs->tensor(), rhs_sampler);
-
-            // Perform Matmul
-            writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
-            writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
-        });
-// *INDENT-ON*
-    // clang-format on
-
-    // Handling leftovers
-    if (k % k0 != 0)
+    writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32,
+    [&]()
     {
-        // *INDENT-OFF*
-        // clang-format off
-        writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
-            [&]()
-            {
-                //Initialize tiles
-                // lhs_tile
-                auto &a =
-                    writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
-                // rhs_tile
-                auto &b =
-                    writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
-                writer->op_assign(a, const_0);
-                writer->op_assign(b, const_0);
+        auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0));
+        auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0));
+        writer->op_assign(tile_lhs, const_0_fp);
+        writer->op_assign(tile_rhs, const_0_fp);
 
-                // Loading the tiles
-                // LHS
-                lhs_sampler.x(x_i);
-                lhs_sampler.y(y);
-                lhs_sampler.z(z);
-                lhs_sampler.b(const_0);
-                writer->op_load(a, lhs->tensor(), lhs_sampler);
+        writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+        writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
 
-                // RHS
-                auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
-                writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
-                rhs_sampler.x(k_i);
-                rhs_sampler.y(y_i);
-                rhs_sampler.z(const_0);
-                rhs_sampler.b(const_0);
-                writer->op_load(b, rhs->tensor(), rhs_sampler);
+        writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
 
-                // Perform Matmul
-                writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
-                writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
-            });
-// *INDENT-ON*
-        // clang-format on
+    });
+
+    // Left-over accumulations for when K is not a multiple of k0
+    if ((k % k0) != 0)
+    {
+        writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+        {
+            auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1));
+            auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1));
+            writer->op_assign(tile_lhs, const_0_fp);
+            writer->op_assign(tile_rhs, const_0_fp);
+
+            writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+            writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+            writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+        });
     }
+    // clang-format on
 }
 
 Window GpuCkwMatMul::get_window() const
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
 
-    const int  m       = _dst->dimension(1);
-    const int  n       = _dst->dimension(0);
-    const bool adj_lhs = _attributes.adj_lhs();
+    const int32_t m       = _dst->dimension(1);
+    const int32_t n       = _dst->dimension(0);
+    const bool    adj_lhs = _attributes.adj_lhs();
 
-    int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
-    int n0 = adjust_vec_size(_settings.n0(), n);
+    const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+    const int32_t n0 = adjust_vec_size(_settings.n0(), n);
 
     // Configure kernel window
     Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
@@ -256,9 +258,9 @@
 
     std::string kernel_name("mat_mul_native");
 
-    const int m = _dst->dimension(1);
-    const int n = _dst->dimension(0);
-    const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+    const int32_t m = _dst->dimension(1);
+    const int32_t n = _dst->dimension(0);
+    const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
 
     kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
     kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
index ae2ea09..790418b 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "src/core/common/Macros.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
 namespace arm_compute
 {
@@ -75,8 +76,8 @@
     const ITensorInfo *_rhs;
     const ITensorInfo *_dst;
 
-    const Attributes _attributes;
-    const Settings   _settings;
+    Attributes _attributes;
+    Settings   _settings;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
index 8ab3ec3..d027f34 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,18 +26,17 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
 
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
-using namespace ckw;
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
 
 namespace arm_compute
 {
@@ -61,272 +60,324 @@
                                         GpuCkwVariableTable     &vtable,
                                         GpuCkwScopedKernelWriter writer) const
 {
-    const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
+    const uint32_t width_idx  = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH);
+    const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT);
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    TileOperand &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_dt    = to_ckw(_dst->data_type());
+    const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x());
+    const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y());
+    const auto pad_x     = static_cast<int32_t>(_attributes.pad().left);
+    const auto pad_y     = static_cast<int32_t>(_attributes.pad().top);
+    const auto stride_x  = static_cast<int32_t>(_attributes.stride().x());
+    const auto stride_y  = static_cast<int32_t>(_attributes.stride().y());
+    const auto src_w     = static_cast<int32_t>(_src->dimension(width_idx));
+    const auto src_h     = static_cast<int32_t>(_src->dimension(height_idx));
+    const auto dst_h     = static_cast<int32_t>(_dst->dimension(height_idx));
 
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
+    // CKW constants
+    auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32));
+    auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32));
+    auto const_pad_x_i32     = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+    auto const_pad_y_i32     = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+    auto const_stride_x_i32  = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+    auto const_stride_y_i32  = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+    auto const_src_w_i32     = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+    auto const_src_h_i32     = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+    auto const_dst_h_i32     = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_0_i32         = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_pos_1_i32     = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_fp          = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_lowest_val_fp =
+        writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32));
+    auto const_neg_inf_val_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f / 0.0f}}, ckw::DataType::Fp32));
 
-    // Data Layout is NHWC
-    constexpr int width_idx  = 1;
-    constexpr int height_idx = 2;
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
 
-    const int32_t pool_size_x   = static_cast<int32_t>(_attributes.pool_size().x());
-    const int32_t pool_size_y   = static_cast<int32_t>(_attributes.pool_size().y());
-    const int32_t pad_x         = static_cast<int32_t>(_attributes.pad().left);
-    const int32_t pad_y         = static_cast<int32_t>(_attributes.pad().top);
-    const int32_t src_width     = static_cast<int32_t>(_src->dimension(width_idx));
-    const int32_t src_height    = static_cast<int32_t>(_src->dimension(height_idx));
-    const auto    src_data_type = _src->data_type();
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+    const int32_t dst_m0 = root_window.y().step();
 
-    // Check if this is global pooling path
-    const bool is_global_pooling =
-        (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
-    // Check if this a case of FP_MIXED_PRECISION
-    const bool use_fp_mixed_precision =
-        (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
-    const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
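+    // A non-zero leftover means the dimension is not a multiple of the block
+    // size; boundary blocks then need the special address modes selected below.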
 
-    TileOperand       &const_0            = writer->declare_tile("0", 0);
-    const TileOperand &const_1            = writer->declare_tile("1", 1);
-    const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits<float>::lowest());
-    const TileOperand &pool_size_x_tile   = writer->declare_tile("POOL_SIZE_X", pool_size_x);
-    const TileOperand &pool_size_y_tile   = writer->declare_tile("POOL_SIZE_Y", pool_size_y);
-    const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
-    const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
-    const TileOperand &pad_x_tile    = writer->declare_tile("PAD_X", pad_x);
-    const TileOperand &pad_y_tile    = writer->declare_tile("PAD_Y", pad_y);
-    const TileOperand &dst_height_tile =
-        writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
-    const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height);
-    const TileOperand &src_width_tile  = writer->declare_tile("SRC_WIDTH", src_width);
+    // Shift-back for the overlapping-min strategy
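+    // (the work-item handling the partial block shifts its start back by this
+    // amount so it always accesses a full, in-bounds block that overlaps the
+    // previous one)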
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
 
-    TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32);
-    TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32);
-    TileOperand &idx_out_w = writer->declare_tile("idx_out_w", ckw::DataType::Int32);
-    TileOperand &idx_out_c = writer->declare_tile("idx_out_c", ckw::DataType::Int32);
-
-    const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0;
-
-    get_coord(writer, idx_out_c, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0);
-    get_coord(writer, idx_out_w, gid_1, 1, 0, "dst_y_", const_0);
-
-    writer->op_binary_expression(idx_out_h, gid_2, BinaryOp::Mod, dst_height_tile); // gid_2 % h
-    writer->op_binary_expression(idx_out_n, gid_2, BinaryOp::Div, dst_height_tile); // gid_2 / h
-
-    TensorTileSampler src_sampler;
-    src_sampler.width(n0);
-    src_sampler.height(m0);
-    src_sampler.format(TensorSamplerFormat::C_W_H);
-    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    src_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-    src_sampler.x(idx_out_c);
-    src_sampler.b(idx_out_n);
-
-    TensorTileSampler dst_sampler;
-    dst_sampler.width(n0);
-    dst_sampler.height(m0);
-    dst_sampler.format(TensorSamplerFormat::C_W_H);
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-    dst_sampler.x(idx_out_c);
-    dst_sampler.y(idx_out_w);
-    dst_sampler.z(idx_out_h);
-    dst_sampler.b(idx_out_n);
-
-    // Prepare dst tensor and tile
-    TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0);
-    if (!dst->has_tile())
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    if (dst_n0_partial == 0)
     {
-        TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info);
-        dst->init_virtual_tensor(dst_tile, dst_sampler);
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
     }
-    const TileOperand &dst_tile = dst->tile();
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
 
+    if (dst_m0_partial == 0)
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+    }
+
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+    // Initialize destination tile
+    writer->op_assign(tile_dst, const_0_fp);
+
+    // Bind tile to the tensor
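+    // (binding makes the tile a virtual tensor, so fused components that
+    // follow can consume it directly without a round-trip to global memory)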
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+     ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+    auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_dst_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    ckw::TensorSampler sampler_src;
+    sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code
+     ********************************************************************************/
+    // Check if it is global pooling
+    const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0);
+
+    // Always accumulate in F32, except for F16 MAX pooling
+    const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) ||
+                         ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX);
+
+    const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16;
+
+    const bool is_wider_acc = dst_dt != acc_dt;
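+    // With a wider accumulator, values are cast up when loaded and the final
+    // result is cast back down before being written to the destination.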
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+    auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+    auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+                                            const_shift_back_dst_n0_i32, const_0_i32);
+    get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
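+    // gid_2 linearizes HEIGHT and BATCH: mout1 = gid_2 % dst_h selects the
+    // output row and bout0 = gid_2 / dst_h the batch.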
+    writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+    writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
     // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling.
-    const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
+    auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
 
     // Initialise result tile with appropriate value
     if (_attributes.pool_type() == PoolingType::MAX)
     {
         if (_settings.use_inf_as_limit())
         {
-            TileContainer            minus_inf_tile_container;
-            std::vector<std::string> value = std::vector<std::string>(n0, "(-INFINITY)");
-            minus_inf_tile_container.push_back({value});
-            const TileOperand &minus_inf =
-                writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
-            writer->op_assign(res_tile, minus_inf);
+            writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None);
         }
         else
         {
-            writer->op_assign(res_tile, const_lowest_value);
+            writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None);
         }
     }
     else
     {
-        writer->op_assign(res_tile, const_0);
+        writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None);
     }
 
-    // idx_in_w = idx_out_w * STRIDE_X - PAD_X
-    TileOperand &idx_in_w = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
-    writer->op_binary_expression(idx_in_w, idx_out_w, BinaryOp::Mul, stride_x_tile);
-    writer->op_binary_expression(idx_in_w, idx_in_w, BinaryOp::Sub, pad_x_tile);
+    // src_coord_x_start = mout0 * STRIDE_X - PAD_X
+    auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
+    writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+    writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32);
 
-    // idx_in_h = idx_out_h * STRIDE_Y - PAD_Y
-    TileOperand &idx_in_h = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
-    writer->op_binary_expression(idx_in_h, idx_out_h, BinaryOp::Mul, stride_y_tile);
-    writer->op_binary_expression(idx_in_h, idx_in_h, BinaryOp::Sub, pad_y_tile);
+    // src_coord_y_start = mout1 * STRIDE_Y - PAD_Y
+    auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
+    writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+    writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32);
 
-    TileOperand &minus_idx_in_w = writer->declare_tile("minus_idx_in_w", ckw::DataType::Int32);
-    TileOperand &minus_idx_in_h = writer->declare_tile("minus_idx_in_h", ckw::DataType::Int32);
+    auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32);
+    auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32);
 
-    writer->op_unary_expression(minus_idx_in_w, UnaryOp::Negate, idx_in_w);
-    writer->op_unary_expression(minus_idx_in_h, UnaryOp::Negate, idx_in_h);
+    writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start);
+    writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start);
 
-    // Pooling starting/ending offsets for X dim
-    TileOperand &pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
-    TileOperand &pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+    // int pool_x_s = max((int)0, -idx_in_w);
+    // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+    // int pool_y_s = max((int)0, -idx_in_h);
+    // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+    auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
+    auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
+    auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+    auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
 
-    writer->op_binary_elementwise_function(pool_x_s, BinaryFunction::Max, const_0, minus_idx_in_w);
-    writer->op_binary_expression(pool_x_e, src_width_tile, BinaryOp::Add, minus_idx_in_w);
-    writer->op_binary_elementwise_function(pool_x_e, BinaryFunction::Min, pool_size_x_tile, pool_x_e);
+    writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start);
+    writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start);
+    writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e);
+    writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start);
+    writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start);
+    writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e);
 
-    // Pooling starting/ending offsets for Y dim
-    TileOperand &pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
-    TileOperand &pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
-
-    writer->op_binary_elementwise_function(pool_y_s, BinaryFunction::Max, const_0, minus_idx_in_h);
-    writer->op_binary_expression(pool_y_e, src_height_tile, BinaryOp::Add, minus_idx_in_h);
-    writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e);
-
-    const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
+    // #if defined(EXCLUDE_PADDING)
+    // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+    // #else // defined(EXCLUDE_PADDING)
+    // int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+    // #endif // defined(EXCLUDE_PADDING)
+    auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
     if (_attributes.exclude_padding())
     {
-        const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
-        const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+        auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+        auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
 
-        writer->op_binary_expression(y_diff, pool_y_e, BinaryOp::Sub, pool_y_s);
-        writer->op_binary_expression(x_diff, pool_x_e, BinaryOp::Sub, pool_x_s);
-
-        writer->op_binary_expression(filter_size, y_diff, BinaryOp::Mul, x_diff);
+        writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s);
+        writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s);
+        writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff);
     }
     else
     {
-        writer->op_binary_expression(filter_size, pool_size_x_tile, BinaryOp::Mul, pool_size_y_tile);
+        writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32);
     }
 
-    const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32);
-    const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32);
+    auto tile_x = writer->declare_tile("x", ckw::DataType::Int32);
+    auto tile_y = writer->declare_tile("y", ckw::DataType::Int32);
 
     if (is_global_pooling)
     {
-        writer->op_assign(x, const_0);
-        writer->op_assign(y, const_0);
-
-        writer->op_assign(pool_y_e, pool_size_y_tile);
-        writer->op_assign(pool_x_e, pool_size_x_tile);
+        writer->op_assign(tile_y, const_0_i32);
+        writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32);
     }
     else
     {
-        writer->op_assign(x, pool_x_s);
-        writer->op_assign(y, pool_y_s);
+        writer->op_assign(tile_y, tile_pool_y_s);
     }
 
     // Y dim for-loop
     writer->op_for_loop(
-        y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1,
+        tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32,
         [&]()
         {
             // Reset the iterator for the inner loop
             if (is_global_pooling)
             {
-                writer->op_assign(x, const_0);
+                writer->op_assign(tile_x, const_0_i32);
+                writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32);
             }
             else
             {
-                writer->op_assign(x, pool_x_s);
+                writer->op_assign(tile_x, tile_pool_x_s);
             }
 
-            TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
-            writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
+            auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32);
+            writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y);
 
             // X dim for-loop
             writer->op_for_loop(
-                x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1,
+                tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32,
                 [&]()
                 {
-                    TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
-                    writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
+                    auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32);
+                    writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x);
 
-                    TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
-
-                    src_sampler.y(a_x);
-                    src_sampler.z(a_y);
+                    ckw::DataType src_dt   = to_ckw(_src->data_type());
+                    auto          tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
 
                     // Load src tile
-                    if (use_fp_mixed_precision)
+                    if (is_wider_acc)
                     {
-                        TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
-                        writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
-                        writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+                        auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+                        writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+                                        tile_src_coord_y, tile_bout0);
+                        writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None);
                     }
                     else
                     {
-                        writer->op_load(src_tile, src->tensor(), src_sampler);
+                        writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+                                        tile_src_coord_y, tile_bout0);
                     }
 
                     // Take the square of the input, for L2 Pooling
                     if (_attributes.pool_type() == PoolingType::L2)
                     {
-                        writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
+                        writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src);
                     }
 
                     // Perform pooling op
                     if (_attributes.pool_type() == PoolingType::MAX)
                     {
-                        writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
+                        writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src);
                     }
                     else
                     {
-                        writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
+                        writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src);
                     }
                 });
         });
 
     if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
     {
-        // filter_size is automatically broadcasted in the operation
-        writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size);
+        // filter_size is automatically broadcast across the operation
+        auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt));
+        writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None);
+        writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp);
     }
 
     // Take square root of the result in L2 pooling
     if (_attributes.pool_type() == PoolingType::L2)
     {
-        writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile);
+        writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res);
     }
 
-    // Store the results and do casting if FP_MIXED_PRECISION
-    if (use_fp_mixed_precision)
+    // Store the results, casting back to the destination type if mixed precision is used
+    if (is_wider_acc)
     {
-        writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None);
+        writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None);
     }
     else
     {
-        writer->op_assign(dst_tile, res_tile);
+        writer->op_assign(tile_dst, tile_res);
     }
 }
 
@@ -334,8 +385,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
 
-    TensorShape        output_shape = _dst->tensor_shape();
-    const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+    TensorShape    output_shape = _dst->tensor_shape();
+    const uint32_t vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
     // Create and configure kernel window
     auto win = calculate_max_window(output_shape, Steps(vec_size));
     win      = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
index f2a7d41..edd7ea9 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,19 +24,22 @@
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
+#include <cstdint>
+
 namespace arm_compute
 {
 namespace experimental
@@ -45,7 +48,7 @@
 {
 namespace
 {
-constexpr unsigned int opencl_vector_size_in_bytes = 16;
+constexpr uint32_t opencl_vector_size_in_bytes = 16;
 } // namespace
 
 GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
@@ -60,65 +63,141 @@
                                               GpuCkwVariableTable     &vtable,
                                               GpuCkwScopedKernelWriter writer) const
 {
-    const size_t width_idx  = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+    const uint32_t width_idx  = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+    const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
 
-    const Window  root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const int32_t n0          = root_window.x().step();
-    const int32_t m0          = root_window.y().step();
-    const int32_t partial_n0  = _dst->dimension(0) % n0;
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
-    // Constants
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto  dst_dt  = to_ckw(_dst->data_type());
     const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
                                                               _attributes.align_corners());
     const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
                                                               _attributes.align_corners());
-    const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
-    const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
-    const auto &tile_0       = writer->declare_tile("0", 0);
-    const auto &tile_half    = writer->declare_tile("half", 0.5f);
-    const auto &tile_1       = writer->declare_tile("1", 1);
-    const auto &tile_src_w   = writer->declare_tile("src_w", static_cast<int32_t>(_src->dimension(width_idx)));
-    const auto &tile_src_h   = writer->declare_tile("src_h", static_cast<int32_t>(_src->dimension(height_idx)));
-    const auto &tile_dst_h   = writer->declare_tile("dst_h", static_cast<int32_t>(_dst->dimension(height_idx)));
+    const auto  src_w   = static_cast<int32_t>(_src->dimension(width_idx));
+    const auto  src_h   = static_cast<int32_t>(_src->dimension(height_idx));
+    const auto  dst_h   = static_cast<int32_t>(_dst->dimension(height_idx));
 
-    const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    // CKW constants
+    auto const_src_w_i32  = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+    auto const_src_h_i32  = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+    auto const_dst_h_i32  = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_pos_1_i32  = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_i32      = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_0_fp       = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+    auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+    auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+
+    // dst_m0 must be 1
+    ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
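+    // (the resize kernel computes one output row per work-item, so only
+    // channel blocking via n0 is supported)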
+
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+    // Shift-back for the overlapping-min strategy
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    if (dst_n0_partial == 0)
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
+    sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+    // Initialize destination tile
+    writer->op_assign(tile_dst, const_0_fp);
+
+    // Bind tile to the tensor
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+     ********************************************************************************/
+    auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_shift_back_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    ckw::TensorSampler sampler_src;
+    sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code
+     ********************************************************************************/
+
+    // ....
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
 
     writer->op_get_global_id(tile_gid_0, 0);
     writer->op_get_global_id(tile_gid_1, 1);
     writer->op_get_global_id(tile_gid_2, 2);
 
-    auto &tile_co = writer->declare_tile("co", ckw::DataType::Int32); // OFM
-    auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH
-    auto &tile_yo = writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT
-    auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX
+    auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+    auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+    auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
 
-    // Get the boundary aware coordinates at each global dimension index
-    get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0);
-    get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0);
-    get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0);
-    get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0);
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+                                            const_0_i32);
+    writer->op_assign(tile_xo, tile_gid_1);
+    writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+    writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
 
-    writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h);
-    writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h);
-
-    const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
-    const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
+    auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+    auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
 
     switch (_attributes.sampling_policy())
     {
         case SamplingPolicy::TOP_LEFT:
             // xi_f = (xo * scale_x)
             // yi_f = (yo * scale_y)
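+            // Note: unlike the old writer, op_binary expects both tile
+            // operands to share a data type, hence the explicit casts of the
+            // integer coordinates to FP32 before scaling.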
-            writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x);
-            writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y);
+            writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+            writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+            writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+            writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
             break;
         case SamplingPolicy::CENTER:
         {
@@ -127,11 +206,12 @@
             const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
             const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
 
-            writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half);
-            writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half);
-
-            writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x);
-            writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y);
+            writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+            writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+            writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+            writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+            writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+            writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
         }
         break;
         default:
@@ -140,63 +220,32 @@
 
     if (_attributes.align_corners())
     {
-        writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f);
-        writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f);
+        writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f);
+        writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f);
     }
 
     // xi0 = clamp((int)xi_f, 0, (int)src_w - 1)
     // yi0 = clamp((int)yi_f, 0, (int)src_h - 1)
-    const auto &tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
-    const auto &tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
+    auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
+    auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
 
-    writer->op_cast_expression(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
-    writer->op_cast_expression(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
 
-    const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
-    const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+    auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+    auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
 
-    writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1);
-    writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1);
+    writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+    writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
 
-    auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
-    auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+    auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+    auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
 
-    writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0,
-                                            tile_src_w_minus_1);
-    writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0,
-                                            tile_src_h_minus_1);
+    writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1);
+    writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1);
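+
+    // Clamping keeps the sampled coordinates inside the source image, which is
+    // why the src sampler can use AddressMode None on every dimension.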
 
-    TensorTileSampler src_sampler;
-    src_sampler.x(tile_co);
-    src_sampler.y(tile_xi0);
-    src_sampler.z(tile_yi0);
-    src_sampler.b(tile_bo);
-    src_sampler.height(m0);
-    src_sampler.width(n0);
-    // We guarantee to not have out-of-bounds accesses
-    src_sampler.format(TensorSamplerFormat::C_W_H);
-    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    src_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    writer->op_load_once(src, src_sampler);
-    auto &tile_src = src->tile();
-
-    TensorTileSampler dst_sampler;
-    dst_sampler.x(tile_co);
-    dst_sampler.y(tile_xo);
-    dst_sampler.z(tile_yo);
-    dst_sampler.b(tile_bo);
-    dst_sampler.height(m0);
-    dst_sampler.width(n0);
-    dst_sampler.format(TensorSamplerFormat::C_W_H);
-    // Do not write to the same memory location with multiple threads
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
-    dst->init_virtual_tensor(tile_dst, dst_sampler);
+    auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0));
+    writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
 
     writer->op_assign(tile_dst, tile_src);
 }
@@ -208,64 +257,139 @@
     const size_t width_idx  = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
     const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
 
-    const Window  root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
-    const int32_t n0          = root_window.x().step();
-    const int32_t m0          = root_window.y().step();
-    const int32_t partial_n0  = _dst->dimension(0) % n0;
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    GpuCkwComponentArgument *src =
-        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst =
-        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
-    // Constants
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto  dst_dt  = to_ckw(_dst->data_type());
     const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
                                                               _attributes.align_corners());
     const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
                                                               _attributes.align_corners());
-    const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
-    const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
-    const auto &tile_0       = writer->declare_tile("0", 0);
-    const auto &tile_half    = writer->declare_tile("half", 0.5f);
-    const auto &tile_1       = writer->declare_tile("1", 1);
-    const auto &tile_src_w   = writer->declare_tile("src_w", static_cast<int32_t>(_src->dimension(width_idx)));
-    const auto &tile_src_h   = writer->declare_tile("src_h", static_cast<int32_t>(_src->dimension(height_idx)));
-    const auto &tile_dst_h   = writer->declare_tile("dst_h", static_cast<int32_t>(_dst->dimension(height_idx)));
+    const auto  src_w   = static_cast<int32_t>(_src->dimension(width_idx));
+    const auto  src_h   = static_cast<int32_t>(_src->dimension(height_idx));
+    const auto  dst_h   = static_cast<int32_t>(_dst->dimension(height_idx));
 
-    const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
-    const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
-    const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+    // CKW constants
+    auto const_src_w_i32  = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+    auto const_src_h_i32  = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+    auto const_dst_h_i32  = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+    auto const_pos_1_i32  = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_0_i32      = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_0_fp       = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+    auto const_pos_1_fp   = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32));
+    auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+    auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+    auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+    // Destination compute block size
+    const int32_t dst_n0 = root_window.x().step();
+
+    // dst_m0 must be 1
+    ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+    // Destination compute block size left-over
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+    // Shift-back for the overlapping-min strategy
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+    ckw::TensorSampler sampler_dst;
+    sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    if (dst_n0_partial == 0)
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    }
+    else
+    {
+        sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+    }
+    sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+    sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+    // Declare destination tile
+    auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+    // Initialize destination tile
+    writer->op_assign(tile_dst, const_0_fp);
+
+    // Bind tile to the tensor
+    dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+     ********************************************************************************/
+    auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_shift_back_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    ckw::TensorSampler sampler_src;
+    sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+    sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+    sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+    sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code
+     ********************************************************************************/
+
+    // ....
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
 
     writer->op_get_global_id(tile_gid_0, 0);
     writer->op_get_global_id(tile_gid_1, 1);
     writer->op_get_global_id(tile_gid_2, 2);
 
-    auto &tile_co = writer->declare_tile("co", ckw::DataType::Int32); // OFM
-    auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH
-    auto &tile_yo = writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT
-    auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX
+    auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+    auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+    auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
 
-    // Get the boundary aware coordinates at each global dimension index
-    get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0);
-    get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0);
-    get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0);
-    get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0);
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+                                            const_0_i32);
+    writer->op_assign(tile_xo, tile_gid_1);
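+    // yo = gid_2 % dst_h
+    // bo = gid_2 / dst_h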
+    writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+    writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
 
-    // yo = coord_dim2 % dst_h
-    // bo = coord_dim2 / dst_h
-    writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h);
-    writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h);
-
-    const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
-    const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
+    auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+    auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
 
     switch (_attributes.sampling_policy())
     {
         case SamplingPolicy::TOP_LEFT:
             // xi_f = (xo * scale_x)
             // yi_f = (yo * scale_y)
-            writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x);
-            writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y);
+            writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+            writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+            writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+            writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
             break;
         case SamplingPolicy::CENTER:
         {
@@ -273,14 +397,16 @@
             // yi_f = ((yo + 0.5f) * scale_y - 0.5f)
             const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
             const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
-            writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half);
-            writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half);
 
-            writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x);
-            writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y);
+            writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+            writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+            writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+            writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+            writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+            writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
 
-            writer->op_binary_expression(tile_xi_f, tile_xi_f, BinaryOp::Sub, tile_half);
-            writer->op_binary_expression(tile_yi_f, tile_yi_f, BinaryOp::Sub, tile_half);
+            writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp);
+            writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp);
         }
         break;
         default:
@@ -289,186 +415,102 @@
 
     // xi = (int)floor(xi_f);
     // yi = (int)floor(yi_f);
-    const auto &tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
-    const auto &tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
-    writer->op_unary_elementwise_function(tile_xi_f_floor, UnaryFunction::Floor, tile_xi_f);
-    writer->op_unary_elementwise_function(tile_yi_f_floor, UnaryFunction::Floor, tile_yi_f);
+    auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
+    auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
+    writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f);
+    writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f);
 
-    const auto &tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
-    const auto &tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
-    writer->op_cast_expression(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
-    writer->op_cast_expression(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
+    auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
+    auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
+    writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
 
     // xi0  = clamp(xi, 0, (int)src_w - 1);
     // yi0  = clamp(yi, 0, (int)src_h - 1);
     // xi1  = clamp(xi + 1, 0, (int)src_w - 1);
     // yi1  = clamp(yi + 1, 0, (int)src_h - 1);
-    const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
-    const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
-    writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1);
-    writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1);
+    auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+    auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+    writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+    writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
 
-    const auto &tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
-    const auto &tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
-    writer->op_binary_expression(tile_xi_plus_1, tile_xi, BinaryOp::Add, tile_1);
-    writer->op_binary_expression(tile_yi_plus_1, tile_yi, BinaryOp::Add, tile_1);
+    auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
+    auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
+    writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32);
+    writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32);
 
-    auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
-    auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
-    auto &tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
-    auto &tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
+    auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+    auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+    auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
+    auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
 
-    writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1);
-    writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1);
-    writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0,
-                                            tile_src_w_minus_1);
-    writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0,
-                                            tile_src_h_minus_1);
+    writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1);
+    writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1);
+    writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1);
+    writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1);
 
-    TensorTileSampler in_sampler;
-    in_sampler.x(tile_co);
-    in_sampler.b(tile_bo);
-    in_sampler.height(1);
-    in_sampler.width(n0);
-    // We guarantee to not have out-of-bounds accesses
-    in_sampler.format(TensorSamplerFormat::C_W_H);
-    in_sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    in_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    in_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+    auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0));
+    auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0));
+    auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0));
+    auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0));
 
-    TensorTileSampler in00_sampler = in_sampler;
-    in00_sampler.y(tile_xi0);
-    in00_sampler.z(tile_yi0);
-
-    TensorTileSampler in01_sampler = in_sampler;
-    in01_sampler.y(tile_xi1);
-    in01_sampler.z(tile_yi0);
-
-    TensorTileSampler in10_sampler = in_sampler;
-    in10_sampler.y(tile_xi0);
-    in10_sampler.z(tile_yi1);
-
-    TensorTileSampler in11_sampler = in_sampler;
-    in11_sampler.y(tile_xi1);
-    in11_sampler.z(tile_yi1);
-
-    auto &tile_in00 = writer->declare_tile("in00", TileInfo(to_ckw(_src->data_type()), 1, n0));
-    auto &tile_in01 = writer->declare_tile("in01", TileInfo(to_ckw(_src->data_type()), 1, n0));
-    auto &tile_in10 = writer->declare_tile("in10", TileInfo(to_ckw(_src->data_type()), 1, n0));
-    auto &tile_in11 = writer->declare_tile("in11", TileInfo(to_ckw(_src->data_type()), 1, n0));
-
-    writer->op_load(tile_in00, src->tensor(), in00_sampler);
-    writer->op_load(tile_in01, src->tensor(), in01_sampler);
-    writer->op_load(tile_in10, src->tensor(), in10_sampler);
-    writer->op_load(tile_in11, src->tensor(), in11_sampler);
-
-    TensorTileSampler dst_sampler;
-    dst_sampler.x(tile_co);
-    dst_sampler.y(tile_xo);
-    dst_sampler.z(tile_yo);
-    dst_sampler.b(tile_bo);
-    dst_sampler.height(m0);
-    dst_sampler.width(n0);
-    dst_sampler.format(TensorSamplerFormat::C_W_H);
-    // Do not write to the same memory location with multiple threads
-    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
-    dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
-    dst->init_virtual_tensor(tile_dst, dst_sampler);
+    writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+    writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo);
+    writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo);
+    writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo);
 
     // Weights of each nearest pixel
-    const auto &tile_a  = writer->declare_tile("a", ckw::DataType::Fp32);
-    const auto &tile_b  = writer->declare_tile("b", ckw::DataType::Fp32);
-    const auto &tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
-    const auto &tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
+    auto tile_a  = writer->declare_tile("a", ckw::DataType::Fp32);
+    auto tile_b  = writer->declare_tile("b", ckw::DataType::Fp32);
+    auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
+    auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
 
     // a = (xi_f - (float)xi)
     // b = (1.f - a)
     // a1 = (yi_f - (float)yi)
     // b1 = (1.f - a1)
-    const auto &tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
-    const auto &tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
-    writer->op_cast_expression(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
-    writer->op_cast_expression(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
+    auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
+    auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
+    writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
 
-    writer->op_binary_expression(tile_a, tile_xi_f, BinaryOp::Sub, tile_xi_float);
-    writer->op_binary_expression(tile_b, tile_1, BinaryOp::Sub, tile_a);
-    writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float);
-    writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1);
+    writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float);
+    writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a);
+    writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float);
+    writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1);
 
-    if (is_data_type_float(_src->data_type()))
-    {
-        // Cast weights to source type
-        const auto &tile_a_src_type  = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
-        const auto &tile_b_src_type  = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
-        const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
-        const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
+    // Cast weights to source type
+    const auto &tile_a_src_type  = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
+    const auto &tile_b_src_type  = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
+    const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
+    const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
 
-        writer->op_cast_expression(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
+    writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
 
-        // in00 * b * b1
-        writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b_src_type);
-        writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b1_src_type);
+    // in00 * b * b1
+    writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type);
+    writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type);
 
-        // in01 * a * b1
-        writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_a_src_type);
-        writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_b1_src_type);
+    // in01 * a * b1
+    writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type);
+    writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type);
 
-        // in10 * b * a1
-        writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_b_src_type);
-        writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_a1_src_type);
+    // in10 * b * a1
+    writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type);
+    writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type);
 
-        // in11 * a * a1
-        writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a_src_type);
-        writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a1_src_type);
+    // in11 * a * a1
+    writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type);
+    writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type);
 
-        // Summation of above terms
-        writer->op_assign(tile_dst, tile_in00);
-        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in01);
-        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in10);
-        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in11);
-    }
-    else
-    {
-        // Cast to float
-        const auto &tile_in00_f = writer->declare_tile("in00_f", TileInfo(ckw::DataType::Fp32, 1, n0));
-        const auto &tile_in01_f = writer->declare_tile("in01_f", TileInfo(ckw::DataType::Fp32, 1, n0));
-        const auto &tile_in10_f = writer->declare_tile("in10_f", TileInfo(ckw::DataType::Fp32, 1, n0));
-        const auto &tile_in11_f = writer->declare_tile("in11_f", TileInfo(ckw::DataType::Fp32, 1, n0));
-        writer->op_cast_expression(tile_in00_f, tile_in00, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_in01_f, tile_in01, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_in10_f, tile_in10, ckw::ConvertPolicy::None);
-        writer->op_cast_expression(tile_in11_f, tile_in11, ckw::ConvertPolicy::None);
-
-        // in00 * b * b1
-        writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b);
-        writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b1);
-
-        // in01 * a * b1
-        writer->op_binary_expression(tile_in01_f, tile_in01_f, BinaryOp::Mul, tile_a);
-        writer->op_binary_expression(tile_in01_f, tile_in01_f, BinaryOp::Mul, tile_b1);
-
-        // in10 * b * a1
-        writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_b);
-        writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_a1);
-
-        // in11 * a * a1
-        writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a);
-        writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a1);
-
-        // Summation of above terms
-        writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in01_f);
-        writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in10_f);
-        writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in11_f);
-
-        // Cast to destination type with saturation
-        writer->op_cast_expression(tile_dst, tile_in00_f, ckw::ConvertPolicy::Saturate);
-    }
+    // Summation of above terms
+    writer->op_assign(tile_dst, tile_in00);
+    writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01);
+    writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10);
+    writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11);
 }
 
 void GpuCkwResize::write_component_code(const ComponentGroup    &comp_group,
@@ -492,8 +534,8 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
 
-    const unsigned int n0  = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
-    Window             win = calculate_max_window(*_dst, Steps(n0));
+    const uint32_t n0  = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
+    Window         win = calculate_max_window(*_dst, Steps(n0));
     return win.collapse(win, Window::DimZ);
 }
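
For reference, the weighted accumulation in the resize kernel above is standard bilinear interpolation. A minimal sketch of the computed expression, using the names from the kernel comments rather than the exact emitted identifiers:

    // Horizontal weights: a  = xi_f - xi,  b  = 1 - a
    // Vertical weights:   a1 = yi_f - yi,  b1 = 1 - a1
    dst = in00 * b * b1     // (xi0, yi0)
        + in01 * a * b1     // (xi1, yi0)
        + in10 * b * a1     // (xi0, yi1)
        + in11 * a * a1;    // (xi1, yi1)
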
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 889706b..d9d741f 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,11 +25,11 @@
 
 #include "arm_compute/core/Error.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
+#include <cstdint>
 #include <string>
 
 namespace arm_compute
@@ -48,14 +48,90 @@
                                        GpuCkwVariableTable     &vtable,
                                        GpuCkwScopedKernelWriter writer) const
 {
-    auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
 
-    auto       &src_tile   = src->tile();
-    const auto &sampler    = src->tile_sampler();
-    auto       &dst_tensor = dst->tensor();
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
 
-    writer->op_store(dst_tensor, src_tile, sampler);
+    auto const_0_i32     = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    const auto &tile_src    = src->tile();
+    auto       &sampler_src = src->tensor_sampler();
+
+    const auto    dst_n0         = static_cast<int32_t>(tile_src.tile_info().width());
+    const auto    dst_m0         = static_cast<int32_t>(tile_src.tile_info().height());
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+    /********************************************************************************
+     * 4 - Define the CKW constants for the compute block parameters
+     ********************************************************************************/
+    auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the sampler for the input tensor
+     ********************************************************************************/
+    // Not required
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code
+     ********************************************************************************/
+    // Not required
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+    auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+    auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+    auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+    // Calculate coordinates
+    get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+                                            const_0_i32);
+    get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32);
+
+    // Get the boundary-aware coordinates at each global dimension index
+    if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+    {
+        writer->op_assign(tile_mout1, const_0_i32);
+        get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+    }
+    else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+    {
+        // For tile_mout1 and tile_bout0 the step can only be 1
+        writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+        writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+    }
+
+    /********************************************************************************
+     * 8 - Write the rest of the code
+     ********************************************************************************/
+    writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
 }
 
 std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const
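
For the Dim0_Dim1_Dim2 sampler format in the store component above, gid_2 linearizes the HEIGHT and BATCH dimensions, so the mod/div pair recovers both indices. A small worked example with illustrative values:

    // dst_h = 32, gid_2 = 70
    mout1 = 70 % 32;  // = 6  (HEIGHT index)
    bout0 = 70 / 32;  // = 2  (BATCH index)
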
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
new file mode 100644
index 0000000..1e6f084
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CkwHelper.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+                             ckw::TileOperand        &coord,
+                             const ckw::TileOperand  &gid,
+                             ckw::TileOperand        &step)
+{
+    writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+}
+
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+                                             ckw::TileOperand        &coord,
+                                             const ckw::TileOperand  &gid,
+                                             ckw::TileOperand        &step,
+                                             ckw::TileOperand        &shift_back,
+                                             ckw::TileOperand        &const_0)
+{
+    // Applied formula: max((gid * step) - shift_back, 0)
+    // where the shift_back operand is: (step - leftover_step) % step
+
+    writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+    writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back);
+    writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
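
As a rough illustration, for a vector size N0 the helper above emits code equivalent to the following OpenCL-style sketch (variable names are hypothetical, not the exact generated identifiers):

    // shift_back = (N0 - dim0 % N0) % N0, so the last block is shifted
    // back into bounds instead of accessing past the tensor edge
    int coord = max((int)get_global_id(0) * N0 - shift_back, 0);
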
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
new file mode 100644
index 0000000..956e7c8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Get the coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out]    coord  Resultant coordinate
+ * @param[in]     gid    Global work item id
+ * @param[in]     step   Step size / vector size
+ */
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+                             ckw::TileOperand        &coord,
+                             const ckw::TileOperand  &gid,
+                             ckw::TileOperand        &step);
+
+/** Get the boundary-aware coordinate along one axis.
+ *
+ * @param[in,out] writer     Writer
+ * @param[out]    coord      Resultant coordinate
+ * @param[in]     gid        Global work item id
+ * @param[in]     step       Step size / vector size
+ * @param[in]     shift_back Shift-back amount, equal to (step - leftover_step) % step
+ * @param[in]     const_0    Constant tile of value 0
+ */
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+                                             ckw::TileOperand        &coord,
+                                             const ckw::TileOperand  &gid,
+                                             ckw::TileOperand        &step,
+                                             ckw::TileOperand        &shift_back,
+                                             ckw::TileOperand        &const_0);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
deleted file mode 100644
index 6ba2b2f..0000000
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
-
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "ckw/TensorTileSampler.h"
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-
-#include <algorithm>
-#include <functional>
-
-using namespace ckw;
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-using SamplerCreator = std::function<TensorTileSampler(GpuCkwScopedKernelWriter &, int32_t /* m0 */, int32_t /* n0 */)>;
-
-/** Load src and dst tiles of dimension [m0, n0] only when not loaded and prepare the sampler
- */
-inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer,
-                                                   GpuCkwComponentArgument  *src,
-                                                   GpuCkwComponentArgument  *dst,
-                                                   int32_t                   m0,
-                                                   int32_t                   n0,
-                                                   SamplerCreator            create_sampler)
-{
-    if (!src->has_tile())
-    {
-        const auto sampler = create_sampler(writer, m0, n0);
-        writer->op_load_once(src, sampler);
-    }
-    else
-    {
-        const auto &sampler = src->tile_sampler();
-        writer->op_load_once(src, sampler);
-    }
-
-    auto       &src_tile = src->tile();
-    const auto &sampler  = src->tile_sampler();
-
-    // Prepare the output tile.
-    if (!dst->has_tile())
-    {
-        auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
-        dst->init_virtual_tensor(tile, sampler);
-    }
-}
-
-/** Get boundary aware coordinate along one axis. Load and store of size step_v at the coordinate will not be out of bound
- *
- * @param[in,out] writer          Writer
- * @param[out]    coord           Resultant coordinate
- * @param[in]     gid             Global work item id
- * @param[in]     step_v          Step size / vector size
- * @param[in]     leftover_step_v Leftover step size at the boundary
- * @param[in]     prefix          Prefix to all the tiles declared within this function
- * @param[in]     const_0         Constant tile of value 0
- */
-inline void get_coord(GpuCkwScopedKernelWriter writer,
-                      TileOperand             &coord,
-                      const TileOperand       &gid,
-                      int32_t                  step_v,
-                      int32_t                  leftover_step_v,
-                      const std::string       &prefix,
-                      const TileOperand       &const_0)
-{
-    auto &step          = writer->declare_tile(prefix + "step", step_v);
-    auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v);
-
-    // step - leftover_step
-    auto &step_minus_leftover = writer->declare_tile(prefix + "step_minus_leftover", ckw::DataType::Int32);
-    writer->op_binary_expression(step_minus_leftover, step, ckw::BinaryOp::Sub, leftover_step);
-
-    // (step - leftover_step) % step
-    auto &coord_correction = writer->declare_tile(prefix + "coord_correction", ckw::DataType::Int32);
-    writer->op_binary_expression(coord_correction, step_minus_leftover, ckw::BinaryOp::Mod, step);
-
-    // (gid * step)
-    auto &raw_coord = writer->declare_tile(prefix + "raw_coord", ckw::DataType::Int32);
-    writer->op_binary_expression(raw_coord, gid, ckw::BinaryOp::Mul, step);
-
-    // (gid * step) - (step - leftover_step) % step
-    auto &corrected_coord = writer->declare_tile(prefix + "corrected_coord", ckw::DataType::Int32);
-    writer->op_binary_expression(corrected_coord, raw_coord, ckw::BinaryOp::Sub, coord_correction);
-
-    // max((gid * step) - (step - leftover_step) % step, 0)
-    writer->op_binary_elementwise_function(coord, ckw::BinaryFunction::Max, corrected_coord, const_0);
-}
-
-/** Declare coordinate tiles "{prefix}_dim0_coord" and "{prefix}_dim1_coord", and create a boundary-aware sampler from tile of size [n0, m0], against the overall dimensions [dim0, dim1]
- * The load and store of tile [n0, m0] will never be out of bound of [dim0, dim1]
- */
-
-/** Declare coordinate tiles "{prefix}_dim0_coord" and "{prefix}_dim1_coord", and create a boundary-aware sampler from tile of size [n0, m0], against the overall dimensions [dim0, dim1]
- * The load and store of tile [n0, m0] will never be out of bound of [dim0, dim1]
- *
- * @param[in,out] writer  Writer
- * @param[in]     gid_0   Global work item id 0
- * @param[in]     gid_1   Global work item id 1
- * @param[in]     dim0_v  Dimension 0
- * @param[in]     dim1_v  Dimension 1
- * @param[in]     n0_v    Tile size dimension 0
- * @param[in]     m0_v    Tile size dimension 1
- * @param[in]     prefix  Prefix to all the tiles declared within this function
- * @param[in]     const_0 Constant tile of value 0
- *
- * @return TensorTileSampler
- */
-inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer,
-                                                          TileOperand             &gid_0,
-                                                          TileOperand             &gid_1,
-                                                          int32_t                  dim0_v,
-                                                          int32_t                  dim1_v,
-                                                          int32_t                  n0_v,
-                                                          int32_t                  m0_v,
-                                                          const std::string        prefix,
-                                                          TileOperand             &const_0)
-{
-    // Clamp tile size [n0, m0] against dimension [dim0, dim1]
-    // This is needed to:
-    // * Guard against tile sizes are bigger than the tensor dimensions
-    // * Handle broadcasting tiles (e.g. src tensor is of size 1 in one of the dimensions)
-    n0_v                       = utility::clamp(n0_v, 1, dim0_v);
-    m0_v                       = utility::clamp(m0_v, 1, dim1_v);
-    const int32_t partial_n0_v = dim0_v % n0_v;
-    const int32_t partial_m0_v = dim1_v % m0_v;
-
-    // Declare #prefix_dim0_coord and #prefix_dim1_coord
-    auto &dim0_coord = writer->declare_tile(prefix + "dim0_coord", ckw::DataType::Int32);
-    get_coord(writer, dim0_coord, gid_0, n0_v, partial_n0_v, prefix + "dim0_", const_0);
-    auto &dim1_coord = writer->declare_tile(prefix + "dim1_coord", ckw::DataType::Int32);
-    get_coord(writer, dim1_coord, gid_1, m0_v, partial_m0_v, prefix + "dim1_", const_0);
-
-    // Set sampler
-    // Only set fields related to boundary aware loading/storing. Other info (e.g. format) is not responsibility of this function
-    TensorTileSampler sampler;
-
-    sampler.x(dim0_coord);
-    sampler.y(dim1_coord);
-
-    sampler.width(n0_v);
-    sampler.height(m0_v);
-
-    sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    sampler.address_mode_y(TensorSamplerAddressModeY::None);
-    sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
-    return sampler;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
new file mode 100644
index 0000000..ad31b06
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::DataType to_ckw(DataType dt)
+{
+    switch (dt)
+    {
+        case DataType::F32:
+            return ckw::DataType::Fp32;
+        case DataType::F16:
+            return ckw::DataType::Fp16;
+        case DataType::S32:
+            return ckw::DataType::Int32;
+        case DataType::S16:
+            return ckw::DataType::Int16;
+        case DataType::S8:
+        case DataType::QASYMM8_SIGNED:
+            return ckw::DataType::Int8;
+        case DataType::U32:
+            return ckw::DataType::Uint32;
+        case DataType::U16:
+            return ckw::DataType::Uint16;
+        case DataType::U8:
+        case DataType::QASYMM8:
+            return ckw::DataType::Uint8;
+        default:
+            return ckw::DataType::Unknown;
+    }
+}
+
+ckw::TensorShape to_ckw(const TensorShape &shape)
+{
+    ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+    ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
+    /// NOTE: Overflow danger. Use size_t?
+    return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+                            static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+                            static_cast<int32_t>(shape[4])};
+}
+
+ckw::TensorDataLayout to_ckw(DataLayout dl)
+{
+    switch (dl)
+    {
+        case DataLayout::NHWC:
+            return ckw::TensorDataLayout::Nhwc;
+        case DataLayout::NDHWC:
+            return ckw::TensorDataLayout::Ndhwc;
+        default:
+            return ckw::TensorDataLayout::Unknown;
+    }
+}
+
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
+{
+    return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+                           to_ckw(tensor_info.data_layout()), tensor_info.id()};
+}
+
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+    switch (storage)
+    {
+        case TensorStorageType::ClBufferUint8Ptr:
+            return ckw::TensorStorageType::BufferUint8Ptr;
+        case TensorStorageType::ClImage2dReadOnly:
+            return ckw::TensorStorageType::Texture2dReadOnly;
+        case TensorStorageType::ClImage2dWriteOnly:
+            return ckw::TensorStorageType::Texture2dWriteOnly;
+        case TensorStorageType::Unknown:
+            return ckw::TensorStorageType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown tensor storage type");
+    }
+}
+
+TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+    switch (component)
+    {
+        case ckw::TensorComponentType::OffsetFirstElement:
+            return TensorComponentType::OffsetFirstElement;
+        case ckw::TensorComponentType::Stride0:
+            return TensorComponentType::Stride0;
+        case ckw::TensorComponentType::Stride1:
+            return TensorComponentType::Stride1;
+        case ckw::TensorComponentType::Stride2:
+            return TensorComponentType::Stride2;
+        case ckw::TensorComponentType::Stride3:
+            return TensorComponentType::Stride3;
+        case ckw::TensorComponentType::Stride4:
+            return TensorComponentType::Stride4;
+        case ckw::TensorComponentType::Dim0:
+            return TensorComponentType::Dim0;
+        case ckw::TensorComponentType::Dim1:
+            return TensorComponentType::Dim1;
+        case ckw::TensorComponentType::Dim2:
+            return TensorComponentType::Dim2;
+        case ckw::TensorComponentType::Dim3:
+            return TensorComponentType::Dim3;
+        case ckw::TensorComponentType::Dim4:
+            return TensorComponentType::Dim4;
+        case ckw::TensorComponentType::Dim1xDim2:
+            return TensorComponentType::Dim1xDim2;
+        case ckw::TensorComponentType::Dim2xDim3:
+            return TensorComponentType::Dim2xDim3;
+        case ckw::TensorComponentType::Dim1xDim2xDim3:
+            return TensorComponentType::Dim1xDim2xDim3;
+        case ckw::TensorComponentType::Unknown:
+            return TensorComponentType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+    }
+}
+
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+    switch (storage)
+    {
+        case ckw::TensorStorageType::BufferUint8Ptr:
+            return TensorStorageType::ClBufferUint8Ptr;
+        case ckw::TensorStorageType::Texture2dReadOnly:
+            return TensorStorageType::ClImage2dReadOnly;
+        case ckw::TensorStorageType::Texture2dWriteOnly:
+            return TensorStorageType::ClImage2dWriteOnly;
+        case ckw::TensorStorageType::Unknown:
+            return TensorStorageType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+    }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
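
A minimal usage sketch of the converters above (the results noted in the comments follow the switch cases):

    // Map Compute Library descriptors to their CKW equivalents and back
    ckw::DataType          ckw_dt = to_ckw(DataType::F16);                       // ckw::DataType::Fp16
    ckw::TensorStorageType ckw_st = to_ckw(TensorStorageType::ClBufferUint8Ptr); // BufferUint8Ptr
    TensorStorageType      acl_st = from_ckw(ckw_st);                            // round-trips to ClBufferUint8Ptr
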
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
index 5da317b..26740cd 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,151 +21,83 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
 
+#include "arm_compute/core/CoreTypes.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "ckw/TensorInfo.h"
 
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
+#include "compute_kernel_writer/include/ckw/TensorInfo.h"
+#include "compute_kernel_writer/include/ckw/types/DataType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h"
+
 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
-inline ckw::DataType to_ckw(DataType dt)
-{
-    switch (dt)
-    {
-        case DataType::F32:
-            return ckw::DataType::Fp32;
-        case DataType::F16:
-            return ckw::DataType::Fp16;
-        case DataType::S32:
-            return ckw::DataType::Int32;
-        case DataType::S16:
-            return ckw::DataType::Int16;
-        case DataType::S8:
-        case DataType::QASYMM8_SIGNED:
-            return ckw::DataType::Int8;
-        case DataType::U32:
-            return ckw::DataType::Uint32;
-        case DataType::U16:
-            return ckw::DataType::Uint16;
-        case DataType::U8:
-        case DataType::QASYMM8:
-            return ckw::DataType::Uint8;
-        default:
-            return ckw::DataType::Unknown;
-    }
-}
+/** Convert the Compute Library data type to Compute Kernel Writer data type
+ *
+ * @param[in] dt The Compute Library data type
+ *
+ * @return the Compute Kernel Writer data type (ckw::DataType)
+ */
+ckw::DataType to_ckw(DataType dt);
 
-inline ckw::TensorShape to_ckw(const TensorShape &shape)
-{
-    ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
-    ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
-    /// NOTE: Overflow danger. Use size_t?
-    return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
-                            static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
-                            static_cast<int32_t>(shape[4])};
-}
-inline ckw::TensorDataLayout to_ckw(DataLayout dl)
-{
-    switch (dl)
-    {
-        case DataLayout::NHWC:
-            return ckw::TensorDataLayout::Nhwc;
-        case DataLayout::NDHWC:
-            return ckw::TensorDataLayout::Ndhwc;
-        default:
-            return ckw::TensorDataLayout::Unknown;
-    }
-}
-inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
-{
-    return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
-                           to_ckw(tensor_info.data_layout()), tensor_info.id()};
-}
+/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape
+ *
+ * @param[in] shape The Compute Library tensor shape
+ *
+ * @return the Compute Kernel Writer tensor shape (ckw::TensorShape)
+ */
+ckw::TensorShape to_ckw(const TensorShape &shape);
 
-inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
-{
-    switch (component)
-    {
-        case ckw::TensorComponentType::OffsetFirstElement:
-            return TensorComponentType::OffsetFirstElement;
-        case ckw::TensorComponentType::Stride0:
-            return TensorComponentType::Stride0;
-        case ckw::TensorComponentType::Stride1:
-            return TensorComponentType::Stride1;
-        case ckw::TensorComponentType::Stride2:
-            return TensorComponentType::Stride2;
-        case ckw::TensorComponentType::Stride3:
-            return TensorComponentType::Stride3;
-        case ckw::TensorComponentType::Stride4:
-            return TensorComponentType::Stride4;
-        case ckw::TensorComponentType::Dim0:
-            return TensorComponentType::Dim0;
-        case ckw::TensorComponentType::Dim1:
-            return TensorComponentType::Dim1;
-        case ckw::TensorComponentType::Dim2:
-            return TensorComponentType::Dim2;
-        case ckw::TensorComponentType::Dim3:
-            return TensorComponentType::Dim3;
-        case ckw::TensorComponentType::Dim4:
-            return TensorComponentType::Dim4;
-        case ckw::TensorComponentType::Dim1xDim2:
-            return TensorComponentType::Dim1xDim2;
-        case ckw::TensorComponentType::Dim2xDim3:
-            return TensorComponentType::Dim2xDim3;
-        case ckw::TensorComponentType::Dim1xDim2xDim3:
-            return TensorComponentType::Dim1xDim2xDim3;
-        case ckw::TensorComponentType::Unknown:
-            return TensorComponentType::Unknown;
-        default:
-            ARM_COMPUTE_ERROR("Unknown CKW tensor component");
-            return TensorComponentType::Unknown;
-    }
-}
+/** Convert the Compute Library data layout to Compute Kernel Writer data layout
+ *
+ * @param[in] dl The Compute Library data layout
+ *
+ * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout)
+ */
+ckw::TensorDataLayout to_ckw(DataLayout dl);
 
-inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
-{
-    switch (storage)
-    {
-        case TensorStorageType::ClBufferUint8Ptr:
-            return ckw::TensorStorageType::BufferUint8Ptr;
-        case TensorStorageType::ClImage2dReadOnly:
-            return ckw::TensorStorageType::Texture2dReadOnly;
-        case TensorStorageType::ClImage2dWriteOnly:
-            return ckw::TensorStorageType::Texture2dWriteOnly;
-        case TensorStorageType::Unknown:
-            return ckw::TensorStorageType::Unknown;
-        default:
-            ARM_COMPUTE_ERROR("Unknown tensor storage type");
-            return ckw::TensorStorageType::Unknown;
-    }
-}
-inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
-{
-    switch (storage)
-    {
-        case ckw::TensorStorageType::BufferUint8Ptr:
-            return TensorStorageType::ClBufferUint8Ptr;
-        case ckw::TensorStorageType::Texture2dReadOnly:
-            return TensorStorageType::ClImage2dReadOnly;
-        case ckw::TensorStorageType::Texture2dWriteOnly:
-            return TensorStorageType::ClImage2dWriteOnly;
-        case ckw::TensorStorageType::Unknown:
-            return TensorStorageType::Unknown;
-        default:
-            ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
-            return TensorStorageType::Unknown;
-    }
-}
+/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info
+ *
+ * @param[in] tensor_info The Compute Library tensor info
+ *
+ * @return the Compute Kernel Writer tensor info (ckw::TensorInfo)
+ */
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info);
+
+/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage
+ *
+ * @param[in] storage The Compute Library tensor storage
+ *
+ * @return the Compute Kernel Writer tensor storage (ckw::TensorStorageType)
+ */
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage);
+
+/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component
+ *
+ * @param[in] component The Compute Kernel Writer tensor component
+ *
+ * @return the Compute Library tensor component
+ */
+TensorComponentType from_ckw(const ckw::TensorComponentType &component);
+
+/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage
+ *
+ * @param[in] storage The Compute Kernel Writer tensor storage
+ *
+ * @return the Compute Library tensor storage
+ */
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage);
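+
+/* Example (illustrative only) of how the converters pair up; the round-trip follows
+ * directly from the mappings implemented in TypeConverter.cpp:
+ *
+ *   const ckw::TensorStorageType ckw_storage = to_ckw(TensorStorageType::ClBufferUint8Ptr);
+ *   const TensorStorageType      acl_storage = from_ckw(ckw_storage); // Back to ClBufferUint8Ptr
+ */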
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
new file mode 100644
index 0000000..5630e39
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
+{
+    switch (attributes.operation())
+    {
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
+            return ckw::BinaryOp::Add;
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
+            return ckw::BinaryOp::Sub;
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
+            return ckw::BinaryOp::Div;
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
+            return ckw::BinaryOp::Mul;
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
+        case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
+        default:
+            ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
+    }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
index 0cba258..644a407 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,42 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
-
-#include "ckw/types/Operators.h"
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
 
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
 
+#include "compute_kernel_writer/include/ckw/types/Operators.h"
+
 namespace arm_compute
 {
 namespace experimental
 {
 namespace dynamic_fusion
 {
-inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
-{
-    switch (attributes.operation())
-    {
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
-            return ckw::BinaryOp::Add;
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
-            return ckw::BinaryOp::Sub;
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
-            return ckw::BinaryOp::Div;
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
-            return ckw::BinaryOp::Mul;
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
-        case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
-        default:
-            ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
-    }
-}
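+/** Convert the Compute Library elementwise binary attributes to the Compute Kernel Writer binary operator
+ *
+ * Only Add, Sub, Div and Mul can be converted; any other operation is an error.
+ *
+ * @param[in] attributes The Compute Library elementwise binary attributes
+ *
+ * @return the Compute Kernel Writer binary operator (ckw::BinaryOp)
+ */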
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes);
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index 409b191..5544963 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
                                    const Attributes                &attributes,
                                    const Settings                  &settings)
 {
-    ARM_COMPUTE_UNUSED(properties);
+    ARM_COMPUTE_UNUSED(properties, settings);
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
 
@@ -57,7 +57,7 @@
     // 1. Check validity
     // Check if pooling is valid
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
+        is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true)),
         "Pooling region that is entirely outside input tensor is unsupported");
 
     // Matching data type
@@ -74,8 +74,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
-        dst->tensor_shape(), misc::shape_calculator::compute_pool_shape(
-                                 *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
+        dst->tensor_shape(),
+        misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true)));
 
     // 2. Check support level
     // Data type
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
index 2cec67d..201c9f2 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,8 +49,7 @@
 Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
-                                                         DataType::S16, DataType::S32);
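+    // Only F16/F32 are supported by the elementwise binary component written with CKW.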
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Add then call the elementwise common is_supported_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
index 6f35e66..d25a2a3 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,12 +57,8 @@
 
     // Check support level
     // Data Type
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
-        src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
-        DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8,
-                                                         DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32,
-                                                         DataType::S32, DataType::F16, DataType::F32);
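+    // Only floating-point casts are supported by the cast component written with CKW.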
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::F16, DataType::F32);
 
     if (context.gpu_language() == GpuLanguage::OpenCL)
     {
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
index 55c604a..2d04f75 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
 
 #include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
@@ -52,10 +53,12 @@
                                      const Pool2dAttributes  &attributes,
                                      const GpuPool2dSettings &settings)
 {
+    ARM_COMPUTE_UNUSED(settings);
+
     if (dst->total_size() == 0U)
     {
         auto shape = misc::shape_calculator::compute_pool_shape(
-            *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+            *src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true));
         auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
     }
 }
@@ -63,17 +66,6 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
 } // namespace
 
-GpuPool2dSettings &GpuPool2dSettings::mixed_precision(bool mixed_precision)
-{
-    _mixed_precision = mixed_precision;
-    return *this;
-}
-
-bool GpuPool2dSettings::mixed_precision() const
-{
-    return _mixed_precision;
-}
-
 GpuPool2dSettings GpuPool2dSettings::use_inf_as_limit(bool use_inf_as_limit)
 {
     _use_inf_as_limit = use_inf_as_limit;
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
index fb09875..8e794c8 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,6 @@
                               const ResizeAttributes   &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
@@ -73,8 +72,7 @@
 
     // Check support level
     // Data type
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
     // Data layout
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
     // Interpolation policy
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
index e5d62c9..c53453a 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,8 +36,7 @@
 Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
-                                                         DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Sub then call the elementwise common validate_op
@@ -49,8 +48,7 @@
 Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
-                                                         DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Sub then call the elementwise common is_supported_op
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
index ebb0374..8936db6 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,9 +78,8 @@
 
 std::string ClTemplatePool2d::get_MxN_kernel_code() const
 {
-    const auto pool_type = _attributes.pool_type();
-    const bool fp_mixed_precision =
-        (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
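+    // The mixed-precision setting has been removed: F16 pooling (other than MAX) now always accumulates in higher precision.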
+    const auto pool_type          = _attributes.pool_type();
+    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
 
     // Define pool op macro.
     std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
@@ -226,11 +225,10 @@
 
 std::string ClTemplatePool2d::get_2x2_kernel_code() const
 {
-    const auto pool_type = _attributes.pool_type();
-    const bool fp_mixed_precision =
-        (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
-    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
-                                                          : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+    const auto  pool_type          = _attributes.pool_type();
+    const bool  fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
+    std::string pool_op            = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+                                                                     : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
 
     std::string code = R"_(
 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
@@ -385,12 +383,12 @@
     lut["meta_kernel_id"] = id();
 
     // Retrieve relevant data
-    const auto padding                = _attributes.pad();
-    const auto stride                 = _attributes.stride();
-    const auto pool_size              = _attributes.pool_size();
-    const auto data_type              = _src->data_type();
-    const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() &&
-                                        _attributes.pool_type() != PoolingType::MAX;
+    const auto padding   = _attributes.pad();
+    const auto stride    = _attributes.stride();
+    const auto pool_size = _attributes.pool_size();
+    const auto data_type = _src->data_type();
+    const auto use_fp_mixed_precision =
+        (_src->data_type() == DataType::F16) && _attributes.pool_type() != PoolingType::MAX;
     const std::string max_initial_value =
         _settings.use_inf_as_limit() ? "(-INFINITY)"
                                      : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index bb9c008..80dcaa8 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -280,10 +280,10 @@
     ITensorInfo *out_1_info = context.create_tensor_info();
 
     CastAttributes cast_0_attr;
-    cast_0_attr.data_type(DataType::S32).convert_policy(ConvertPolicy::SATURATE);
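+    // Only floating-point casts are supported by the CKW-based backend, so cast to F16 instead of S32.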
+    cast_0_attr.data_type(DataType::F16);
 
     CastAttributes cast_1_attr;
-    cast_1_attr.data_type(DataType::F32).convert_policy(ConvertPolicy::SATURATE);
+    cast_1_attr.data_type(DataType::F32);
 
     ITensorInfo *ans_0_info = GpuAdd::create_op(sketch, in_0_info, in_1_info);
     GpuOutput::create_op(sketch, ans_0_info, out_0_info);
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
index a358d47..9bfdc96 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -22,9 +22,6 @@
  * SOFTWARE.
  */
 
-// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530)
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
 
@@ -265,4 +262,3 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
index cb6c8c5..4ef359e 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,84 +42,27 @@
 namespace
 {
 // Tolerance
-constexpr AbsoluteTolerance<float> one_tolerance(1);
 constexpr AbsoluteTolerance<float> zero_tolerance(0);
 
 /** Input data sets **/
-// QASYMM8
-const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32));
-
-// U8
-const auto CastU8toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S8));
-const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
-const auto CastU8toS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-const auto CastU8toU32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U32));
-const auto CastU8toS32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32));
-const auto CastU8toF16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F16));
-const auto CastU8toF32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F32));
-
-// S8
-const auto CastS8toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U8));
-const auto CastS8toU16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U16));
-const auto CastS8toS16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S16));
-const auto CastS8toU32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U32));
-const auto CastS8toS32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S32));
-const auto CastS8toF16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F16));
-const auto CastS8toF32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F32));
-
-// U16
-const auto CastU16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8));
-const auto CastU16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S8));
-const auto CastU16toS16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S16));
-const auto CastU16toU32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
-const auto CastU16toS32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S32));
-const auto CastU16toF16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F16));
-const auto CastU16toF32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F32));
-
-// S16
-const auto CastS16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
-const auto CastS16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S8));
-const auto CastS16toU16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U16));
-const auto CastS16toU32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U32));
-const auto CastS16toS32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto CastS16toF16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F16));
-const auto CastS16toF32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F32));
-
-// U32
-const auto CastU32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U8));
-const auto CastU32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S8));
-const auto CastU32toU16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U16));
-const auto CastU32toS16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S16));
-const auto CastU32toS32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S32));
-const auto CastU32toF16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F16));
-const auto CastU32toF32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F32));
-
-// S32
-const auto CastS32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
-const auto CastS32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S8));
-const auto CastS32toU16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U16));
-const auto CastS32toS16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S16));
-const auto CastS32toU32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U32));
-const auto CastS32toF16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
-const auto CastS32toF32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
 
 // F16
-const auto CastF16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8));
-const auto CastF16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S8));
-const auto CastF16toU16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U16));
-const auto CastF16toS16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S16));
-const auto CastF16toU32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U32));
-const auto CastF16toS32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
 const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
 
 // F32
-const auto CastF32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto CastF32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S8));
-const auto CastF32toU16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U16));
-const auto CastF32toS16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S16));
-const auto CastF32toU32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U32));
-const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
 const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+
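+// GpuCast only supports floating-point types, for which the convert policy has no effect, so a single policy is enough.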
+class DFConvertPolicies final : public framework::dataset::ContainerDataset<std::vector<ConvertPolicy>>
+{
+public:
+    DFConvertPolicies() : ContainerDataset("ConvertPolicy", {ConvertPolicy::WRAP})
+    {
+    }
+};
 } // namespace
 
 TEST_SUITE(CL)
@@ -127,18 +70,6 @@
 TEST_SUITE(CAST)
 
 template <typename T>
-using DynamicFusionCLCastToU8Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, uint8_t>;
-template <typename T>
-using DynamicFusionCLCastToS8Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, int8_t>;
-template <typename T>
-using DynamicFusionCLCastToU16Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, uint16_t>;
-template <typename T>
-using DynamicFusionCLCastToS16Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, int16_t>;
-template <typename T>
-using DynamicFusionCLCastToU32Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, uint32_t>;
-template <typename T>
-using DynamicFusionCLCastToS32Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, int32_t>;
-template <typename T>
 using DynamicFusionCLCastToF16Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, half>;
 template <typename T>
 using DynamicFusionCLCastToF32Fixture = DynamicFusionCastValidationFixture<CLTensor, CLAccessor, GpuCast, T, float>;
@@ -146,85 +77,16 @@
 #define CAST_SUITE(NAME, idt, odt, type, dataset, tolerance)                                                                     \
     TEST_SUITE(NAME)                                                                                                             \
     FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \
-                                                                                      datasets::ConvertPolicies()))              \
+                                                                                              DFConvertPolicies()))              \
     {                                                                                                                            \
         validate(CLAccessor(_target), _reference, tolerance);                                                                    \
     }                                                                                                                            \
     TEST_SUITE_END()
 
-// QASYMM8
-CAST_SUITE(QASYMM8_to_F32, DataType::QASYMM8, DataType::F32, DynamicFusionCLCastToF32Fixture<uint8_t>, CastQASYMM8toF32Dataset, zero_tolerance)
-
-// U8
-CAST_SUITE(U8_to_S8, DataType::U8, DataType::S8, DynamicFusionCLCastToS8Fixture<uint8_t>, CastU8toS8Dataset, zero_tolerance)
-CAST_SUITE(U8_to_U16, DataType::U8, DataType::U16, DynamicFusionCLCastToU16Fixture<uint8_t>, CastU8toU16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_S16, DataType::U8, DataType::S16, DynamicFusionCLCastToS16Fixture<uint8_t>, CastU8toS16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_U32, DataType::U8, DataType::U32, DynamicFusionCLCastToU32Fixture<uint8_t>, CastU8toU32Dataset, zero_tolerance)
-CAST_SUITE(U8_to_S32, DataType::U8, DataType::S32, DynamicFusionCLCastToS32Fixture<uint8_t>, CastU8toS32Dataset, zero_tolerance)
-CAST_SUITE(U8_to_F16, DataType::U8, DataType::F16, DynamicFusionCLCastToF16Fixture<uint8_t>, CastU8toF16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_F32, DataType::U8, DataType::F32, DynamicFusionCLCastToF32Fixture<uint8_t>, CastU8toF32Dataset, zero_tolerance)
-
-// S8
-CAST_SUITE(S8_to_U8, DataType::S8, DataType::U8, DynamicFusionCLCastToU8Fixture<int8_t>, CastS8toU8Dataset, zero_tolerance)
-CAST_SUITE(S8_to_U16, DataType::S8, DataType::U16, DynamicFusionCLCastToU16Fixture<int8_t>, CastS8toU16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_S16, DataType::S8, DataType::S16, DynamicFusionCLCastToS16Fixture<int8_t>, CastS8toS16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_U32, DataType::S8, DataType::U32, DynamicFusionCLCastToU32Fixture<int8_t>, CastS8toU32Dataset, zero_tolerance)
-CAST_SUITE(S8_to_S32, DataType::S8, DataType::S32, DynamicFusionCLCastToS32Fixture<int8_t>, CastS8toS32Dataset, zero_tolerance)
-CAST_SUITE(S8_to_F16, DataType::S8, DataType::F16, DynamicFusionCLCastToF16Fixture<int8_t>, CastS8toF16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_F32, DataType::S8, DataType::F32, DynamicFusionCLCastToF32Fixture<int8_t>, CastS8toF32Dataset, zero_tolerance)
-
-// U16
-CAST_SUITE(U16_to_U8, DataType::U16, DataType::U8, DynamicFusionCLCastToU8Fixture<uint16_t>, CastU16toU8Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S8, DataType::U16, DataType::S8, DynamicFusionCLCastToS8Fixture<uint16_t>, CastU16toS8Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S16, DataType::U16, DataType::S16, DynamicFusionCLCastToS16Fixture<uint16_t>, CastU16toS16Dataset, zero_tolerance)
-CAST_SUITE(U16_to_U32, DataType::U16, DataType::U32, DynamicFusionCLCastToU32Fixture<uint16_t>, CastU16toU32Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S32, DataType::U16, DataType::S32, DynamicFusionCLCastToS32Fixture<uint16_t>, CastU16toS32Dataset, zero_tolerance)
-CAST_SUITE(U16_to_F16, DataType::U16, DataType::F16, DynamicFusionCLCastToF16Fixture<uint16_t>, CastU16toF16Dataset, zero_tolerance)
-CAST_SUITE(U16_to_F32, DataType::U16, DataType::F32, DynamicFusionCLCastToF32Fixture<uint16_t>, CastU16toF32Dataset, zero_tolerance)
-
-// S16
-CAST_SUITE(S16_to_U8, DataType::S16, DataType::U8, DynamicFusionCLCastToU8Fixture<int16_t>, CastS16toU8Dataset, zero_tolerance)
-CAST_SUITE(S16_to_S8, DataType::S16, DataType::S8, DynamicFusionCLCastToS8Fixture<int16_t>, CastS16toS8Dataset, zero_tolerance)
-CAST_SUITE(S16_to_U16, DataType::S16, DataType::U16, DynamicFusionCLCastToU16Fixture<int16_t>, CastS16toU16Dataset, zero_tolerance)
-CAST_SUITE(S16_to_U32, DataType::S16, DataType::U32, DynamicFusionCLCastToU32Fixture<int16_t>, CastS16toU32Dataset, zero_tolerance)
-CAST_SUITE(S16_to_S32, DataType::S16, DataType::S32, DynamicFusionCLCastToS32Fixture<int16_t>, CastS16toS32Dataset, zero_tolerance)
-CAST_SUITE(S16_to_F16, DataType::S16, DataType::F16, DynamicFusionCLCastToF16Fixture<int16_t>, CastS16toF16Dataset, zero_tolerance)
-CAST_SUITE(S16_to_F32, DataType::S16, DataType::F32, DynamicFusionCLCastToF32Fixture<int16_t>, CastS16toF32Dataset, zero_tolerance)
-
-// U32
-CAST_SUITE(U32_to_U8, DataType::U32, DataType::U8, DynamicFusionCLCastToU8Fixture<uint32_t>, CastU32toU8Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S8, DataType::U32, DataType::S8, DynamicFusionCLCastToS8Fixture<uint32_t>, CastU32toS8Dataset, zero_tolerance)
-CAST_SUITE(U32_to_U16, DataType::U32, DataType::U16, DynamicFusionCLCastToU16Fixture<uint32_t>, CastU32toU16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S16, DataType::U32, DataType::S16, DynamicFusionCLCastToS16Fixture<uint32_t>, CastU32toS16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S32, DataType::U32, DataType::S32, DynamicFusionCLCastToS32Fixture<uint32_t>, CastU32toS32Dataset, zero_tolerance)
-CAST_SUITE(U32_to_F16, DataType::U32, DataType::F16, DynamicFusionCLCastToF16Fixture<uint32_t>, CastU32toF16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_F32, DataType::U32, DataType::F32, DynamicFusionCLCastToF32Fixture<uint32_t>, CastU32toF32Dataset, zero_tolerance)
-
-// S32
-CAST_SUITE(S32_to_U8, DataType::S32, DataType::U8, DynamicFusionCLCastToU8Fixture<int32_t>, CastS32toU8Dataset, zero_tolerance)
-CAST_SUITE(S32_to_S8, DataType::S32, DataType::S8, DynamicFusionCLCastToS8Fixture<int32_t>, CastS32toS8Dataset, zero_tolerance)
-CAST_SUITE(S32_to_U16, DataType::S32, DataType::U16, DynamicFusionCLCastToU16Fixture<int32_t>, CastS32toU16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_S16, DataType::S32, DataType::S16, DynamicFusionCLCastToS16Fixture<int32_t>, CastS32toS16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_U32, DataType::S32, DataType::U32, DynamicFusionCLCastToU32Fixture<int32_t>, CastS32toU32Dataset, zero_tolerance)
-CAST_SUITE(S32_to_F16, DataType::S32, DataType::F16, DynamicFusionCLCastToF16Fixture<int32_t>, CastS32toF16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_F32, DataType::S32, DataType::F32, DynamicFusionCLCastToF32Fixture<int32_t>, CastS32toF32Dataset, zero_tolerance)
-
 // F16
-CAST_SUITE(F16_to_U8, DataType::F16, DataType::U8, DynamicFusionCLCastToU8Fixture<half>, CastF16toU8Dataset, one_tolerance)
-CAST_SUITE(F16_to_S8, DataType::F16, DataType::S8, DynamicFusionCLCastToS8Fixture<half>, CastF16toS8Dataset, one_tolerance)
-CAST_SUITE(F16_to_U16, DataType::F16, DataType::U16, DynamicFusionCLCastToU16Fixture<half>, CastF16toU16Dataset, one_tolerance)
-CAST_SUITE(F16_to_S16, DataType::F16, DataType::S16, DynamicFusionCLCastToS16Fixture<half>, CastF16toS16Dataset, one_tolerance)
-CAST_SUITE(F16_to_U32, DataType::F16, DataType::U32, DynamicFusionCLCastToU32Fixture<half>, CastF16toU32Dataset, one_tolerance)
-CAST_SUITE(F16_to_S32, DataType::F16, DataType::S32, DynamicFusionCLCastToS32Fixture<half>, CastF16toS32Dataset, one_tolerance)
 CAST_SUITE(F16_to_F32, DataType::F16, DataType::F32, DynamicFusionCLCastToF32Fixture<half>, CastF16toF32Dataset, zero_tolerance)
 
 // F32
-CAST_SUITE(F32_to_U8, DataType::F32, DataType::U8, DynamicFusionCLCastToU8Fixture<float>, CastF32toU8Dataset, one_tolerance)
-CAST_SUITE(F32_to_S8, DataType::F32, DataType::S8, DynamicFusionCLCastToS8Fixture<float>, CastF32toS8Dataset, one_tolerance)
-CAST_SUITE(F32_to_U16, DataType::F32, DataType::U16, DynamicFusionCLCastToU16Fixture<float>, CastF32toU16Dataset, one_tolerance)
-CAST_SUITE(F32_to_S16, DataType::F32, DataType::S16, DynamicFusionCLCastToS16Fixture<float>, CastF32toS16Dataset, one_tolerance)
-CAST_SUITE(F32_to_U32, DataType::F32, DataType::U32, DynamicFusionCLCastToU32Fixture<float>, CastF32toU32Dataset, one_tolerance)
-CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, DynamicFusionCLCastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
 CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, DynamicFusionCLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
 
 TEST_SUITE_END() // CAST
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
index d714a2f..96b7967 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -25,6 +25,7 @@
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/LargeMatMulDataset.h"
+#include "tests/datasets/MatMulDataset.h"
 #include "tests/datasets/SmallMatMulDataset.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/framework/Fixture.h"
@@ -54,27 +55,36 @@
     0.02)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
 } // namespace
 
-/** M0 values to test --precommit*/
-const auto m0_values_precommit = framework::dataset::make("M0", {1, 3});
+/** M0 values to test - precommit */
+const auto m0_values_lhs_nt_precommit = framework::dataset::make("M0", {1, 2, 3});
 
-/** N0 values to test --precommit*/
-const auto n0_values_precommit = framework::dataset::make("N0", {1, 2, 4});
+/** N0 values to test - precommit */
+const auto n0_values_rhs_t_precommit = framework::dataset::make("N0", {1, 2, 4});
 
-/** K0 values to test --precommit*/
-const auto k0_values_precommit = framework::dataset::make("K0", {1, 2, 3});
+/** K0 values to test - precommit */
+const auto k0_values_rhs_t_precommit = framework::dataset::make("K0", {1, 2, 4});
 
-/** M0 values to test --nightly*/
-const auto m0_values_nightly_lhs_nt = framework::dataset::make("M0", {1, 2, 3, 4, 5, 6, 7, 8});
-const auto m0_values_nightly_lhs_t  = framework::dataset::make("M0", {1, 2, 3, 4, 8});
+/** M0 values to test - nightly */
+const auto m0_values_lhs_nt_nightly = framework::dataset::make("M0", {1, 2, 3, 4});
 
-/** N0 values to test --nightly*/
-const auto n0_values_nightly_rhs_nt = framework::dataset::make("N0", {1, 2, 3, 4, 8, 16});
-const auto n0_values_nightly_rhs_t  = framework::dataset::make("N0", {1, 2, 3, 4, 8});
+/** N0 values to test - nightly */
+const auto n0_values_rhs_t_nightly = framework::dataset::make("N0", {1, 2, 3, 4, 8});
 
-/** K0 values to test --nightly*/
-const auto k0_values_nightly_lhs_nt_rhs_nt = framework::dataset::make("K0", {1, 2, 3, 4, 8, 16});
-const auto k0_values_nightly_rhs_t         = framework::dataset::make("K0", {1, 2, 3, 4, 8});
-const auto k0_values_nightly_lhs_t_rhs_nt  = framework::dataset::make("K0", {1, 2, 3, 4, 5, 6, 7, 8});
+/** K0 values to test - nightly */
+const auto k0_values_rhs_t_nightly = framework::dataset::make("K0", {1, 2, 3, 4, 8});
+
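+// Dedicated MatMul shapes for the dynamic fusion tests, including sizes that are not
+// multiples of the block sizes above, to exercise the leftover handling.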
+class DFMatMulDataset final : public datasets::MatMulDataset
+{
+public:
+    DFMatMulDataset()
+    {
+        // LHS = [K, M], RHS = [N, K], DST = [N, M]
+        add_config(TensorShape(1U, 1U), TensorShape(1U, 1U), TensorShape(1U, 1U));
+        add_config(TensorShape(1U, 2U), TensorShape(2U, 1U), TensorShape(2U, 2U));
+        add_config(TensorShape(9U, 6U), TensorShape(5U, 9U), TensorShape(5U, 6U));
+        add_config(TensorShape(32U, 37U), TensorShape(17U, 32U), TensorShape(17U, 37U));
+    }
+};
 
 TEST_SUITE(CL)
 TEST_SUITE(DYNAMIC_FUSION)
@@ -247,70 +257,33 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
-FIXTURE_DATA_TEST_CASE(
-    RunTiny,
-    DynamicFusionGpuMatmulFixture<float>,
-    framework::DatasetMode::ALL,
-    combine(combine(combine(combine(combine(combine(combine(datasets::TinyMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            m0_values_precommit),
-                                    n0_values_precommit),
-                            k0_values_precommit),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunPrecommit,
+                       DynamicFusionGpuMatmulFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_precommit,
+                               n0_values_rhs_t_precommit,
+                               k0_values_rhs_t_precommit,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(
-    RunSmall,
-    DynamicFusionGpuMatmulFixture<float>,
-    framework::DatasetMode::ALL,
-    combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            m0_values_precommit),
-                                    n0_values_precommit),
-                            k0_values_precommit),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F32)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-FIXTURE_DATA_TEST_CASE(
-    RunLargeRhsTransposed,
-    DynamicFusionGpuMatmulFixture<float>,
-    framework::DatasetMode::NIGHTLY,
-    combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            m0_values_nightly_lhs_nt),
-                                    n0_values_nightly_rhs_t),
-                            k0_values_nightly_rhs_t),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F32)))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
-}
-
-// Running High Dimensional test is enough for FP32, because we're stressing the number of dimensions, not data type or M0/N0/K0
-FIXTURE_DATA_TEST_CASE(
-    RunHighDimensional,
-    DynamicFusionGpuMatmulFixture<float>,
-    framework::DatasetMode::ALL,
-    combine(combine(combine(combine(combine(combine(combine(datasets::HighDimensionalMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            framework::dataset::make("M0", {2})),
-                                    framework::dataset::make("N0", {2})),
-                            framework::dataset::make("K0", {2})),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunNightly,
+                       DynamicFusionGpuMatmulFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_nightly,
+                               n0_values_rhs_t_nightly,
+                               k0_values_rhs_t_nightly,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32);
@@ -319,35 +292,33 @@
 
 TEST_SUITE(FP16)
 
-FIXTURE_DATA_TEST_CASE(
-    RunSmall,
-    DynamicFusionGpuMatmulFixture<half>,
-    framework::DatasetMode::ALL,
-    combine(combine(combine(combine(combine(combine(combine(datasets::SmallMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            m0_values_precommit),
-                                    n0_values_precommit),
-                            k0_values_precommit),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunPrecommit,
+                       DynamicFusionGpuMatmulFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_precommit,
+                               n0_values_rhs_t_precommit,
+                               k0_values_rhs_t_precommit,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
 }
 
-FIXTURE_DATA_TEST_CASE(
-    RunLargeRhsTransposed,
-    DynamicFusionGpuMatmulFixture<half>,
-    framework::DatasetMode::NIGHTLY,
-    combine(combine(combine(combine(combine(combine(combine(datasets::LargeMatMulDataset(),
-                                                            framework::dataset::make("TransposeA", {false})),
-                                                    framework::dataset::make("TransposeB", {true})),
-                                            m0_values_nightly_lhs_nt),
-                                    n0_values_nightly_rhs_t),
-                            k0_values_nightly_rhs_t),
-                    framework::dataset::make("ExportRhsToCLImage", {false})),
-            framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunNightly,
+                       DynamicFusionGpuMatmulFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(DFMatMulDataset(),
+                               framework::dataset::make("TransposeA", {false}),
+                               framework::dataset::make("TransposeB", {true}),
+                               m0_values_lhs_nt_nightly,
+                               n0_values_rhs_t_nightly,
+                               k0_values_rhs_t_nightly,
+                               framework::dataset::make("ExportRhsToCLImage", {false}),
+                               framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16);
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
index c11bffe..af02ce3 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Mul.cpp
@@ -22,9 +22,6 @@
  * SOFTWARE.
  */
 
-// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530)
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
 
@@ -222,4 +219,3 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
index f894ce3..e537826 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -55,17 +55,11 @@
                     framework::dataset::make("Stride", {Size2D(1, 1), Size2D(2, 1), Size2D(5, 7)})),
             framework::dataset::make("ExcludePadding", {true}));
 
-const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", {true, false});
-
 template <typename T>
 using DynamicFusionGpuPool2dFixture = DynamicFusionGpuPool2dValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
 
 template <typename T>
 using DFSpecialGpuPool2dFixture = DynamicFusionGpuPool2dSpecialValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
-
-template <typename T>
-using DFPoolMixedPrecisionFixture =
-    DynamicFusionGpuPool2dMixedPrecisionValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
 // *INDENT-OFF*
 // clang-format off
 
@@ -92,7 +86,7 @@
     GpuWorkloadSketch sketch{ &context };
 
     // Declare GpuPool2d settings
-    const GpuPool2dSettings &settings = GpuPool2dSettings().mixed_precision(false);
+    const GpuPool2dSettings &settings = GpuPool2dSettings();
 
     // Validate Pool2d Configuration
     auto                   src_info    = context.create_tensor_info(input_info);
@@ -175,27 +169,6 @@
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall,
-                       DFPoolMixedPrecisionFixture<half>,
-                       framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(datasets::SmallNoneUnitShapes(), PoolingLayerDatasetFP),
-                                       framework::dataset::make("DataType", DataType::F16)),
-                               pool_fp_mixed_precision_dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge,
-                       DFPoolMixedPrecisionFixture<half>,
-                       framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(datasets::LargeShapes(), PoolingLayerDatasetFP),
-                                       framework::dataset::make("DataType", DataType::F16)),
-                               pool_fp_mixed_precision_dataset))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_f16);
-}
-
 TEST_SUITE(GlobalPooling)
 FIXTURE_DATA_TEST_CASE(
     RunSmall,
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
index 10915ac..a6bcf4a 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
@@ -64,10 +64,7 @@
                                                           });
 
 /** Tolerance */
-constexpr AbsoluteTolerance<uint8_t> tolerance_q8(1);
-constexpr AbsoluteTolerance<int8_t>  tolerance_qs8(1);
-constexpr AbsoluteTolerance<int16_t> tolerance_s16(1);
-constexpr float                      tolerance_f32_absolute(0.001f);
+constexpr float tolerance_f32_absolute(0.001f);
 
 RelativeTolerance<float> tolerance_f32(0.05);
 constexpr float          abs_tolerance_f16(0.1f);
@@ -105,26 +102,27 @@
 
 TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
 {
-    const std::map<DataType, bool> supported_data_types = {
-        {DataType::U8, true},
-        {DataType::S8, false},
-        {DataType::QSYMM8, false},
-        {DataType::QASYMM8, true},
-        {DataType::QASYMM8_SIGNED, true},
-        {DataType::QSYMM8_PER_CHANNEL, false},
-        {DataType::U16, false},
-        {DataType::S16, true},
-        {DataType::QSYMM16, false},
-        {DataType::QASYMM16, false},
-        {DataType::U32, false},
-        {DataType::S32, false},
-        {DataType::U64, false},
-        {DataType::S64, false},
-        {DataType::BFLOAT16, false},
-        {DataType::F16, true},
-        {DataType::F32, true},
-        {DataType::F64, false},
-        {DataType::SIZET, false},
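+    // Only floating-point data types are supported by the resize component written with CKW.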
+    const std::map<DataType, bool> supported_data_types =
+    {
+        { DataType::U8, false },
+        { DataType::S8, false },
+        { DataType::QSYMM8, false },
+        { DataType::QASYMM8, false },
+        { DataType::QASYMM8_SIGNED, false },
+        { DataType::QSYMM8_PER_CHANNEL, false },
+        { DataType::U16, false },
+        { DataType::S16, false },
+        { DataType::QSYMM16, false },
+        { DataType::QASYMM16, false },
+        { DataType::U32, false },
+        { DataType::S32, false },
+        { DataType::U64, false },
+        { DataType::S64, false },
+        { DataType::BFLOAT16, false },
+        { DataType::F16, true },
+        { DataType::F32, true },
+        { DataType::F64, false },
+        { DataType::SIZET, false },
     };
 
     for (auto &kv : supported_data_types)
@@ -352,266 +350,6 @@
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
-TEST_SUITE(Integer)
-TEST_SUITE(U8)
-const auto u8_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<uint8_t>())),
-                              framework::dataset::make("DataType", DataType::U8));
-FIXTURE_DATA_TEST_CASE(Run,
-                       DynamicFusionResizeFixture<uint8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(u8_shape, ScaleSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-FIXTURE_DATA_TEST_CASE(RunAlignCorners,
-                       DynamicFusionResizeFixture<uint8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(u8_shape, ScaleAlignCornersSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-const auto u8_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<uint8_t>())),
-                                      framework::dataset::make("DataType", DataType::U8));
-FIXTURE_DATA_TEST_CASE(RunNightly,
-                       DynamicFusionResizeFixture<uint8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(u8_nightly_shape, ScaleSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
-                       DynamicFusionResizeFixture<uint8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(u8_nightly_shape, ScaleAlignCornersSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-TEST_SUITE_END() // U8
-
-TEST_SUITE(S16)
-const auto s16_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<int16_t>())),
-                               framework::dataset::make("DataType", DataType::S16));
-FIXTURE_DATA_TEST_CASE(Run,
-                       DynamicFusionResizeFixture<int16_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(s16_shape, ScaleSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
-}
-FIXTURE_DATA_TEST_CASE(RunAlignCorners,
-                       DynamicFusionResizeFixture<int16_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(s16_shape, ScaleAlignCornersSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
-}
-const auto s16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<int16_t>())),
-                                       framework::dataset::make("DataType", DataType::S16));
-FIXTURE_DATA_TEST_CASE(RunNightly,
-                       DynamicFusionResizeFixture<int16_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(s16_nightly_shape, ScaleSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
-}
-FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
-                       DynamicFusionResizeFixture<int16_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_DATASET_DYNAMIC_FUSION(s16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
-}
-TEST_SUITE_END() // S16
-TEST_SUITE_END() // Integer
-
-template <typename T>
-using DynamicFusionResizeQuantizedFixture =
-    DynamicFusionResizeQuantizedValidationFixture<CLTensor, CLAccessor, GpuResize, T>;
-TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
-const auto qasymm8_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<uint8_t>())),
-                                   framework::dataset::make("DataType", DataType::QASYMM8));
-FIXTURE_DATA_TEST_CASE(Run,
-                       DynamicFusionResizeQuantizedFixture<uint8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_shape,
-                                                                 ScaleSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-FIXTURE_DATA_TEST_CASE(RunAlignCorners,
-                       DynamicFusionResizeQuantizedFixture<uint8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_shape,
-                                                                 ScaleAlignCornersSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-const auto qasymm8_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<uint8_t>())),
-                                           framework::dataset::make("DataType", DataType::QASYMM8));
-FIXTURE_DATA_TEST_CASE(RunNightly,
-                       DynamicFusionResizeQuantizedFixture<uint8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_nightly_shape,
-                                                                 ScaleSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
-                       DynamicFusionResizeQuantizedFixture<uint8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_nightly_shape,
-                                                                 ScaleAlignCornersSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
-}
-TEST_SUITE_END() // QASYMM8
-
-TEST_SUITE(QASYMM8_SIGNED)
-const auto qasymm8_signed_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<int8_t>())),
-                                          framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-FIXTURE_DATA_TEST_CASE(Run,
-                       DynamicFusionResizeQuantizedFixture<int8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_signed_shape,
-                                                                 ScaleSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
-}
-FIXTURE_DATA_TEST_CASE(RunAlignCorners,
-                       DynamicFusionResizeQuantizedFixture<int8_t>,
-                       framework::DatasetMode::ALL,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_signed_shape,
-                                                                 ScaleAlignCornersSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
-}
-const auto qasymm8_signed_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<int8_t>())),
-                                                  framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-FIXTURE_DATA_TEST_CASE(RunNightly,
-                       DynamicFusionResizeQuantizedFixture<int8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_signed_nightly_shape,
-                                                                 ScaleSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
-}
-FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners,
-                       DynamicFusionResizeQuantizedFixture<int8_t>,
-                       framework::DatasetMode::NIGHTLY,
-                       ASSEMBLE_QUANTIZED_DATASET_DYNAMIC_FUSION(qasymm8_signed_nightly_shape,
-                                                                 ScaleAlignCornersSamplingPolicySet,
-                                                                 QuantizationInfoSet))
-{
-    //Create valid region
-    TensorInfo        src_info(_shape, 1, _data_type);
-    const ValidRegion valid_region =
-        calculate_valid_region_scale(src_info, _reference.shape(), _interpolation_policy, _sampling_policy, false);
-
-    // Validate output
-    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
-}
-TEST_SUITE_END() // QASYMM8_SIGNED
-
-TEST_SUITE_END() // Quantized
-
 TEST_SUITE_END() // RESIZE
 TEST_SUITE_END() // DYNAMIC_FUSION
 TEST_SUITE_END() // CL
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
index ef9f75b..c7ab1e7 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
@@ -22,9 +22,6 @@
  * SOFTWARE.
  */
 
-// TODO: Fix testing of CKW Elementwise Binary (COMPMID-6530)
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
 
@@ -63,13 +60,13 @@
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // S16 is valid data type for Sub
-                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // S32 is valid data type for Sub
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
                                                         TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for lhs
                                                         TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                         TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32),    // Broadcast Y dimension is not allowed
-                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::S16),    // Broadcast Z dimension is not allowed
+                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::S16),    // Invalid data type combination
                                                         TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
                                                       }),
                framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
@@ -86,7 +83,7 @@
                                                        TensorInfo(TensorShape( 3U,  8U, 1U), 1, DataType::S16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("Expected", { true, false, false, false, false, true, true, false, true, true, false, false, true })),
+               framework::dataset::make("Expected", { true, false, false, false, false, false, false, false, true, true, false, false, true })),
                input1_info, input2_info, expected)
 {
     // Create a new workload sketch
@@ -263,4 +260,3 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
index 65a3363..4c1cc94 100644
--- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/MatMulKernelFixture.h
@@ -203,7 +203,7 @@
                                       bool               pretranspose_b,
                                       DataType           data_type)
     {
-        // We collapse dimensions > 3 onto dimension 3, i.e. 5D+ tensors will look like 4D
+        // We collapse dimensions > 2 onto dimension 2, i.e. 5D+ tensors will look like 3D
         // This is necessary unless we choose to extend gemm reference for 5D+ tensors
         TensorShape output_shape_collapsed = output_shape.collapsed_from(Window::DimZ);
         TensorShape shape_a_collapsed      = shape_a.collapsed_from(Window::DimZ);
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
index dd3519b..b0c7143 100644
--- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
@@ -51,11 +51,11 @@
 class DynamicFusionGpuPool2dValidationGenericFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape input_shape, const Pool2dAttributes &pool_attr, DataType data_type, bool mixed_precision)
+    void setup(TensorShape input_shape, const Pool2dAttributes &pool_attr, DataType data_type)
     {
-        _target = compute_target(input_shape, pool_attr, data_type, mixed_precision);
-        _reference =
-            compute_reference(input_shape, convert_pool_attr_to_pool_info(pool_attr, mixed_precision), data_type);
+        _target    = compute_target(input_shape, pool_attr, data_type);
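+        // The reference is always computed with mixed precision (the operator no longer exposes this as a setting)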
+        _reference = compute_reference(
+            input_shape, convert_pool_attr_to_pool_info(pool_attr, true /* mixed_precision */), data_type);
     }
 
 protected:
@@ -82,10 +82,7 @@
     }
 
     // Given input is in nchw format
-    TensorType compute_target(TensorShape             input_shape,
-                              const Pool2dAttributes &pool_attr,
-                              const DataType          data_type,
-                              bool                    mixed_precision)
+    TensorType compute_target(TensorShape input_shape, const Pool2dAttributes &pool_attr, const DataType data_type)
     {
         CLScheduler::get().default_reinit();
 
@@ -102,7 +99,7 @@
         auto dst_info   = context.create_tensor_info();
 
         // Create Pool2dSettings
-        GpuPool2dSettings pool_settings = GpuPool2dSettings().mixed_precision(mixed_precision);
+        GpuPool2dSettings pool_settings = GpuPool2dSettings();
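+        // Default settings: GpuPool2dSettings does not expose a mixed-precision option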
 
         ITensorInfo *ans_info = FunctionType::create_op(sketch, input_info, pool_attr, pool_settings);
         GpuOutput::create_op(sketch, ans_info, dst_info);
@@ -168,29 +165,7 @@
             input_shape,
             Pool2dAttributes().pool_type(pool_type).pool_size(pool_size).pad(pad).stride(stride).exclude_padding(
                 exclude_padding),
-            data_type, false);
-    }
-};
-
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class DynamicFusionGpuPool2dMixedPrecisionValidationFixture
-    : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
-{
-public:
-    void setup(TensorShape input_shape,
-               PoolingType pool_type,
-               Size2D      pool_size,
-               Padding2D   pad,
-               Size2D      stride,
-               bool        exclude_padding,
-               DataType    data_type,
-               bool        mixed_precision)
-    {
-        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
-            input_shape,
-            Pool2dAttributes().pool_type(pool_type).pool_size(pool_size).pad(pad).stride(stride).exclude_padding(
-                exclude_padding),
-            data_type, mixed_precision);
+            data_type);
     }
 };
 
@@ -202,7 +177,7 @@
     void setup(TensorShape input_shape, Pool2dAttributes pool_attr, DataType data_type)
     {
         DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(
-            input_shape, pool_attr, data_type, false);
+            input_shape, pool_attr, data_type);
     }
 };
 
diff --git a/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h b/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h
index edf0dff..08fffb3 100644
--- a/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h
+++ b/tests/validation/fixtures/dynamic_fusion/operators/CastFixture.h
@@ -120,6 +120,8 @@
         GpuWorkloadSketch sketch{&context};
 
         // Create sketch tensors
+        // Here, we use DataLayout::NCHW just for the test. However, the optimal data layout to
+        // be used with dynamic fusion is NHWC
         ITensorInfo *src_info =
             context.create_tensor_info(TensorInfo(shape, 1, dt_in, DataLayout::NCHW)); // layout is not important
         ITensorInfo *dst_info = context.create_tensor_info();
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index e8831a3..23e28d6 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -3330,7 +3330,7 @@
 {
     os << "Settings="
        << "["
-       << "FPMixedPrecision=" << settings.mixed_precision() << "]";
+       << "UseInfAsLimit=" << settings.use_inf_as_limit() << "]";
     return os;
 }