Port DirectConv2d to CKW backend

Ports the direct convolution 2D kernel from the experimental Dynamic Fusion interface to use the new Compute Kernel Writer backend for OpenCL code generation.

Support is for FP16/FP32 only.

Resolves: COMPMID-6259

Change-Id: Ia8d7b9cb789737b22b1d877cd798a73eda0ce4ab
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10059
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index b7936e5..14290b9 100644
--- a/Android.bp
+++ b/Android.bp
@@ -647,6 +647,7 @@
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp",
diff --git a/compute_kernel_writer/CMakeLists.txt b/compute_kernel_writer/CMakeLists.txt
index 1e82f9c..9a97563 100644
--- a/compute_kernel_writer/CMakeLists.txt
+++ b/compute_kernel_writer/CMakeLists.txt
@@ -102,7 +102,7 @@
     ${CKW_CXX_FLAGS}
     "$<$<CXX_COMPILER_ID:GNU>:${GNU_WARNINGS}>"
     "$<$<CONFIG:Debug>:${CKW_ASSERTS_OPTS}>"
-    "$<$<BOOL:${CKW_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
+    "$<$<BOOL:${CKW_ENABLE_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
     # Set CMAKE_CXX_FLAGS last so user can overwrite options
     ${CMAKE_CXX_FLAGS}
     PRIVATE
@@ -113,7 +113,7 @@
 target_compile_definitions(ckw PUBLIC
     $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED>
     $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
-    $<$<BOOL:${CKW_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
+    $<$<BOOL:${CKW_ENABLE_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
     $<$<BOOL:${CKW_ENABLE_OPENCL}>:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED>
 )
 
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
index c116e62..72f85c7 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
@@ -129,11 +129,38 @@
 
     /** Load the data from the tensor memory to the tile using the sampling information.
      *
+     * @param[out] tile       The tile to be loaded.
+     * @param[in]  tensor     The tensor to be read.
+     * @param[in]  sampler    The tensor sampling information.
+     * @param[in]  dilation_y Dilation in the Y dimension.
+     */
+    void op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y = TileOperand("dil_y", 1));
+
+    /** Load the data from the tensor memory to the tile using the indirect buffer approach and respective of the sampling information.
+     *
      * @param[out] tile    The tile to be loaded.
      * @param[in]  tensor  The tensor to be read.
      * @param[in]  sampler The tensor sampling information.
      */
-    void op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler);
+    void op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler);
+
+    /** Construct an indirection buffer in @p tile containing the precalculated addresses of elements in the source tensor.
+     *
+     * @param[out] tile    The tile to be loaded.
+     * @param[in]  tensor  The tensor the be read.
+     * @param[in]  sampler The tensor sampling information.
+     * @param[in]  x       The X coordinate.
+     * @param[in]  y       The Y coordinate.
+     * @param[in]  x_off   Offset in the X dimension.
+     * @param[in]  y_off   Offset in the Y dimension.
+     */
+    void util_get_indirect_buffer(TileOperand             &tile,
+                                  const TensorOperand     &tensor,
+                                  const TensorTileSampler &sampler,
+                                  const TileOperand       &x,
+                                  const TileOperand       &y,
+                                  const TileOperand       &x_off,
+                                  const TileOperand       &y_off);
 
     /** Store the tile to the tensor using the specified sampling information.
      *
diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
index c071707..24ee373 100644
--- a/compute_kernel_writer/prototype/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
@@ -37,6 +37,8 @@
 
 class Kernel;
 
+using TileContainer = std::vector<std::vector<std::string>>;
+
 /** Tile operand which can be either scalar, vector or 2D tile. */
 class TileOperand : public OperandBase
 {
@@ -69,6 +71,13 @@
      */
     TileOperand(const ::std::string &name, float value);
 
+    /** Initialize a new instance of @ref TileOperand for compile-time constant variable.
+     *
+     * @param[in] name  The name of the tile.
+     * @param[in] value The value of the tile.
+     */
+    TileOperand(const ::std::string &name, const ::std::vector<std::vector<std::string>> &value, DataType dt);
+
     /** Prohibit copy of tile operand. */
     TileOperand(const TileOperand &) = delete;
 
@@ -96,13 +105,21 @@
     /** Get the scalar value of the tile.
      *
      * The tile must have the shape of 1, 1 (i.e. scalar).
+     *
+     * @return Scalar value as a string.
      */
-    ScalarValue scalar_value() const;
+    std::string scalar_value() const;
+
+    /** Get the values of the tile.
+     *
+     * @return 2D container of values.
+     */
+    const TileContainer &value() const;
 
 private:
-    TileInfo    _info;
-    ScalarValue _value{};
-    bool        _constant;
+    TileInfo      _info;
+    TileContainer _value{};
+    bool          _constant;
 };
 
 } // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
index 9122e51..f29cf12 100644
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp
@@ -128,6 +128,10 @@
             name,
             prototype::TileInfo(info.data_type(), info.width(), info.height()));
     }
+    else
+    {
+        _impl->declare_const_tile(name, operand.value(), operand.data_type());
+    }
 
     return operand;
 }
@@ -136,7 +140,30 @@
 // Load and store
 // =================================================================================================
 
-void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler)
+void KernelWriter::op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y)
+{
+    prototype::TensorOperand impl_tensor(
+        tensor.name(),
+        prototype::GpuSampler{
+            sampler.format(),
+            prototype::to_gpu_tensor_storage(tensor.storage_type()),
+            sampler.address_mode_x(),
+            sampler.address_mode_y(),
+            sampler.address_mode_z() });
+
+    auto impl_x = sampler.x().create_impl_operand(_impl.get());
+    auto impl_y = sampler.y().create_impl_operand(_impl.get());
+    auto impl_z = sampler.z().create_impl_operand(_impl.get());
+    auto impl_b = sampler.b().create_impl_operand(_impl.get());
+
+    auto impl_dilation_y = dilation_y.create_impl_operand(_impl.get());
+
+    auto impl_dst = tile.create_impl_operand(_impl.get());
+
+    _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b, impl_dilation_y);
+}
+
+void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler)
 {
     prototype::TensorOperand impl_tensor(
         tensor.name(),
@@ -154,7 +181,34 @@
 
     auto impl_dst = tile.create_impl_operand(_impl.get());
 
-    _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+    _impl->op_load_indirect(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+}
+
+void KernelWriter::util_get_indirect_buffer(TileOperand             &tile,
+                                            const TensorOperand     &tensor,
+                                            const TensorTileSampler &sampler,
+                                            const TileOperand       &x,
+                                            const TileOperand       &y,
+                                            const TileOperand       &x_off,
+                                            const TileOperand       &y_off)
+{
+    prototype::TensorOperand impl_tensor(
+        tensor.name(),
+        prototype::GpuSampler{
+            sampler.format(),
+            prototype::to_gpu_tensor_storage(tensor.storage_type()),
+            sampler.address_mode_x(),
+            sampler.address_mode_y(),
+            sampler.address_mode_z() });
+
+    auto impl_x     = x.create_impl_operand(_impl.get());
+    auto impl_y     = y.create_impl_operand(_impl.get());
+    auto impl_x_off = x_off.create_impl_operand(_impl.get());
+    auto impl_y_off = y_off.create_impl_operand(_impl.get());
+
+    auto impl_dst = tile.create_impl_operand(_impl.get());
+
+    _impl->util_get_indirect_buffer(impl_dst, impl_tensor, impl_x, impl_y, impl_x_off, impl_y_off);
 }
 
 void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler)
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index a8dc7fb..2b51947 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -3009,7 +3009,7 @@
             address += " + (";
             address += x + ") * sizeof(" + dst_type + ")";
         }
-        if(y != "0" && (_mapper.is_one_component_y() != true))
+        if(y != "0")
         {
             const std::string stride_y = _mapper.tensor_component_stride_y();
             address += " + (";
@@ -3249,7 +3249,7 @@
         std::string coord_x = "(" + x + ") >> 2";
         std::string coord_y = "(";
 
-        if(y != "0" && (_mapper.is_one_component_y() != true))
+        if(y != "0")
         {
             coord_y += y;
         }
@@ -4024,13 +4024,6 @@
             _data->code += ", ";
             _data->code += x_s->scalar(0, i).str;
             _data->code += " >= 0);\n";
-            // mi_0 = select(wxh, mi_0, y_s >= 0);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, x_s < width);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
@@ -4039,6 +4032,13 @@
             _data->code += x_s->scalar(0, i).str;
             _data->code += " < ";
             _data->code += width + ");\n";
+            // mi_0 = select(wxh, mi_0, y_s >= 0);
+            _data->code += dst->scalar(0, i).str;
+            _data->code += " = select(-1, ";
+            _data->code += dst->scalar(0, i).str;
+            _data->code += ", ";
+            _data->code += y_s->scalar(0, i).str;
+            _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, y_s < height);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
index fcb3cb6..bf6a15b 100644
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ b/compute_kernel_writer/prototype/src/TileOperand.cpp
@@ -30,22 +30,42 @@
 {
 
 TileOperand::TileOperand(const std::string &name, const TileInfo &info)
-    : OperandBase(name), _info(info), _value{ 0 }, _constant(false)
+    : OperandBase(name),
+      _info(info),
+      _value{ std::vector<std::string>{ "0" } },
+      _constant(false)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, DataType data_type)
-    : OperandBase(name), _info(TileInfo{ data_type }), _value(0), _constant(false)
+    : OperandBase(name),
+      _info(TileInfo{ data_type }),
+      _value{ std::vector<std::string>{ "0" } },
+      _constant(false)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, int32_t value)
-    : OperandBase(name), _info(TileInfo{ DataType::Int32 }), _value(value), _constant(true)
+    : OperandBase(name),
+      _info(TileInfo{ DataType::Int32 }),
+      _value{ std::vector<std::string>{ std::to_string(value) } },
+      _constant(true)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, float value)
-    : OperandBase(name), _info(TileInfo{ DataType::Fp32 }), _value(value), _constant(true)
+    : OperandBase(name),
+      _info(TileInfo{ DataType::Fp32 }),
+      _value{ std::vector<std::string>{ std::to_string(value) } },
+      _constant(true)
+{
+}
+
+TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
+    : OperandBase(name),
+      _info(TileInfo{ dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size()) }),
+      _value(vals),
+      _constant(true)
 {
 }
 
@@ -55,17 +75,23 @@
 
     if(_constant)
     {
-        switch(_info.data_type())
+        if(is_scalar())
         {
-            case DataType::Int32:
-                return prototype::Operand(std::to_string(_value.get<int32_t>()),
-                                          prototype::OperandType::ScalarInt32);
+            switch(_info.data_type())
+            {
+                case DataType::Int32:
+                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
 
-            case DataType::Fp32:
-                return prototype::Operand(std::to_string(_value.get<float>()), prototype::OperandType::ScalarFp32);
+                case DataType::Fp32:
+                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32);
 
-            default:
-                CKW_ASSERT(false);
+                default:
+                    CKW_ASSERT(false);
+            }
+        }
+        else
+        {
+            return prototype::Operand(name());
         }
     }
     else
@@ -94,11 +120,16 @@
     return _info.width() == 1 && _info.height() == 1;
 }
 
-ScalarValue TileOperand::scalar_value() const
+std::string TileOperand::scalar_value() const
 {
     CKW_ASSERT(is_scalar());
     CKW_ASSERT(is_constant());
 
+    return _value[0][0];
+}
+
+const TileContainer &TileOperand::value() const
+{
     return _value;
 }
 
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 801f1f0..edc0c3b 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,13 @@
      - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
    - Enable transposed convolution with non-square kernels on CPU and GPU.
    - Added support for input data type U64/S64 in CLCast.
+   - Added new Compute Kernel Writer (CKW) subproject that offers a C++ interface to generate tile-based OpenCL code in just-in-time fashion.
+   - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface with support for FP16/FP32 only:
+     - @ref experimental::dynamic_fusion::GpuCkwActivation
+     - @ref experimental::dynamic_fusion::GpuCkwCast
+     - @ref experimental::dynamic_fusion::GpuCkwDirectConv2d
+     - @ref experimental::dynamic_fusion::GpuCkwElementwiseBinary
+     - @ref experimental::dynamic_fusion::GpuCkwStore
  - Various optimizations and bug fixes.
 
 v23.05.1 Public patch release
diff --git a/filelist.json b/filelist.json
index b7845a7..953b81d 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2353,6 +2353,7 @@
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000..3c90664
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "ckw/TensorTileSampler.h"
+#include "ckw/TileInfo.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<std::string>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId                      id,
+                                       const ArgumentPack<ITensorInfo> &tensors,
+                                       const Attributes                &attributes,
+                                       const Settings                  &settings)
+    : IGpuCkwComponentDriver{ id, tensors },
+      _src{},
+      _wei{},
+      _bia{},
+      _dst{},
+      _attributes{ attributes },
+      _settings{ settings }
+{
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+    _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+{
+    const auto desc = _settings.direct_conv_descriptor();
+    ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+                             "Only the weights tensor can be exported to cl_image");
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const unsigned int width_idx   = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+    // Tunable parameters
+    const int32_t m0         = root_window.y().step();
+    const int32_t n0         = root_window.x().step();
+    const int32_t k0         = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
+    const int32_t partial_n0 = _dst->dimension(0) % n0;
+
+    const int32_t K = _src->dimension(channel_idx);
+
+    // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+    //   a) k0 is equal to 4
+    // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+    //   b) K is a multiple of 4
+    // This is a limitation in the current interface due to the variable table being responsible for maintaining
+    // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+    // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+    // extra loop to compute the left-over elements.
+    const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
+
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *wei = vtable.declare_variable(
+        comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *bia = nullptr;
+
+    const bool using_bias = _bia != nullptr;
+
+    if(using_bias)
+    {
+        bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
+    }
+
+    // Constants
+    const auto kernel_height    = static_cast<int32_t>(_wei->dimension(height_idx));
+    const auto kernel_width     = static_cast<int32_t>(_wei->dimension(width_idx));
+    const auto src_channels     = static_cast<int32_t>(_src->dimension(channel_idx));
+    auto      &tile_kernel_w    = writer->declare_tile("kernel_w", kernel_width);
+    auto      &tile_kernel_size = writer->declare_tile("kernel_size", kernel_width * kernel_height);
+    auto      &tile_src_c       = writer->declare_tile("src_c", static_cast<int32_t>(_src->dimension(channel_idx)));
+    auto      &tile_dst_w       = writer->declare_tile("dst_w", static_cast<int32_t>(_dst->dimension(width_idx)));
+    auto      &tile_stride_x    = writer->declare_tile("stride_x", static_cast<int32_t>(_attributes.stride().x()));
+    auto      &tile_stride_y    = writer->declare_tile("stride_y", static_cast<int32_t>(_attributes.stride().y()));
+    auto      &tile_pad_x       = writer->declare_tile("pad_x", static_cast<int32_t>(_attributes.pad().left));
+    auto      &tile_pad_y       = writer->declare_tile("pad_y", static_cast<int32_t>(_attributes.pad().top));
+    auto      &tile_k0          = writer->declare_tile("k0", k0);
+    auto      &tile_0           = writer->declare_tile("0", 0);
+    auto      &tile_1           = writer->declare_tile("1", 1);
+
+    auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
+    auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
+    auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto &tile_cout = writer->declare_tile("cout", ckw::DataType::Int32); // OFM
+    auto &tile_mout = writer->declare_tile("mout", ckw::DataType::Int32); // WIDTH x HEIGHT
+    auto &tile_bout = writer->declare_tile("bout", ckw::DataType::Int32); // BATCH SIZE IDX
+
+    // Get the boundary aware coordinates at each global dimension index
+    get_coord(writer, tile_cout, tile_gid_0, n0, partial_n0, tile_cout.name() + "_dim0_", tile_0);
+    get_coord(writer, tile_mout, tile_gid_1, m0, 0, tile_mout.name() + "_dim1_", tile_0);
+    get_coord(writer, tile_bout, tile_gid_2, 1, 0, tile_bout.name() + "_dim2_", tile_0);
+
+    TensorTileSampler src_sampler;
+    src_sampler.width(k0);
+    src_sampler.height(m0);
+    src_sampler.format(TensorSamplerFormat::C_WH_1);
+    // We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+    // compute left-over elements
+    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
+    // indirection buffer mi does not contain negative values representing out-of-bounds reads.
+    src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly);
+    src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler wei_sampler;
+    wei_sampler.width(k0);
+    wei_sampler.height(n0);
+    wei_sampler.format(TensorSamplerFormat::C_WH_1);
+    // We cannot have out-of-bounds accesses for the weights
+    wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+    wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler dst_sampler;
+    dst_sampler.width(n0);
+    dst_sampler.height(m0);
+    dst_sampler.format(TensorSamplerFormat::C_WH_1);
+    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
+    dst_sampler.address_mode_y(TensorSamplerAddressModeY::ClampToMaxEdgeOnly);
+    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+    dst_sampler.x(tile_cout);
+    dst_sampler.y(tile_mout);
+    dst_sampler.z(tile_0);
+    dst_sampler.b(tile_bout);
+
+    if(!dst->has_tile())
+    {
+        auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
+        dst->init_virtual_tensor(tile, dst_sampler);
+    }
+    auto &tile_dst = dst->tile();
+
+    writer->op_assign(tile_dst, tile_0);
+
+    // We create a 2d container of size (M0, 1) to store the indices for iteration
+    TileContainer it;
+    for(int m = 0; m < m0; ++m)
+    {
+        std::vector<std::string> idx { std::to_string(m) };
+        it.push_back({ idx });
+    }
+    const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
+
+    auto &tile_xi = writer->declare_tile("xi", TileInfo(ckw::DataType::Int32, m0, 1));
+    auto &tile_yi = writer->declare_tile("yi", TileInfo(ckw::DataType::Int32, m0, 1));
+
+    // Convert the linear index to coordinate
+    // xi = ((mout + i) % dst_w) * stride_x - pad_x
+    // yi = ((mout + i) / dst_w) * stride_y - pad_y
+    writer->op_binary_expression(tile_xi, tile_mout, BinaryOp::Add, tile_it);
+    writer->op_binary_expression(tile_yi, tile_mout, BinaryOp::Add, tile_it);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mod, tile_dst_w);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Div, tile_dst_w);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mul, tile_stride_x);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Mul, tile_stride_y);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Sub, tile_pad_x);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Sub, tile_pad_y);
+
+    auto &tile_y_b = writer->declare_tile("y_b", ckw::DataType::Int32);
+    writer->op_binary_expression(tile_y_b, tile_cout, BinaryOp::Mul, tile_kernel_size);
+
+    auto &tile_i = writer->declare_tile("i", ckw::DataType::Int32);
+    writer->op_assign(tile_i, tile_0);
+
+    // clang-format off
+    writer->op_for_loop(tile_i, BinaryOp::Less, tile_kernel_size, tile_i, AssignmentOp::Increment, tile_1, [&]()
+    {
+        auto &tile_x_k = writer->declare_tile("x_k", ckw::DataType::Int32);
+        auto &tile_y_k = writer->declare_tile("y_k", ckw::DataType::Int32);
+
+        writer->op_binary_expression(tile_x_k, tile_i, BinaryOp::Mod, tile_kernel_w);
+        writer->op_binary_expression(tile_y_k, tile_i, BinaryOp::Div, tile_kernel_w);
+
+        auto &tile_ck = writer->declare_tile("ck", ckw::DataType::Int32);
+        writer->op_assign(tile_ck, tile_0);
+
+        auto &tile_mi = writer->declare_tile("mi", TileInfo(ckw::DataType::Int32, m0, 1));
+        // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+        // x_s = xi + x_k
+        // y_s = yi + y_k
+        // mi = x_s + y_s * width;
+        // mi = select(-1, mi, x_s >= 0);
+        // mi = select(-1, mi, x_s < width);
+        // mi = select(-1, mi, y_s >= 0);
+        // mi = select(-1, mi, y_s < height);
+        writer->util_get_indirect_buffer(tile_mi, src->tensor(), src_sampler, tile_xi, tile_yi, tile_x_k, tile_y_k);
+
+        src_sampler.x(tile_ck);
+        src_sampler.y(tile_mi);
+        src_sampler.z(tile_0);
+        src_sampler.b(tile_bout);
+
+        wei_sampler.x(tile_ck);
+        wei_sampler.y(tile_y_b);
+        wei_sampler.z(tile_0);
+        wei_sampler.b(tile_0);
+
+        auto &tile_src_c_minus_k0 = writer->declare_tile("src_c_minus_k0", src_channels - k0);
+
+        writer->op_for_loop(tile_ck, BinaryOp::LessEqual, tile_src_c_minus_k0, tile_ck, AssignmentOp::Increment, tile_k0, [&]()
+        {
+            auto &tile_lhs = writer->declare_tile("lhs", TileInfo(to_ckw(_src->data_type()), m0, k0));
+            auto &tile_rhs = writer->declare_tile("rhs", TileInfo(to_ckw(_wei->data_type()), n0, k0));
+            writer->op_assign(tile_lhs, tile_0);
+            writer->op_assign(tile_rhs, tile_0);
+
+            writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+            writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+            writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+        });
+
+        // Left-over accumulations for when K is not a multiple of k0
+        if(!(K % k0 == 0))
+        {
+            writer->op_for_loop(tile_ck, BinaryOp::Less, tile_src_c, tile_ck, AssignmentOp::Increment, tile_1, [&]()
+            {
+                auto &tile_lhs = writer->declare_tile("lhs_leftover", TileInfo(to_ckw(_src->data_type()), m0, 1));
+                auto &tile_rhs = writer->declare_tile("rhs_leftover", TileInfo(to_ckw(_wei->data_type()), n0, 1));
+                writer->op_assign(tile_lhs, tile_0);
+                writer->op_assign(tile_rhs, tile_0);
+
+                writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+                writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+                writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+            });
+        }
+
+    writer->op_binary_expression(tile_y_b, tile_y_b, BinaryOp::Add, tile_1);
+    });
+    // clang-format on
+
+    // Bias addition
+    // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+    // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+    if(using_bias)
+    {
+        if(!bia->has_tile())
+        {
+            // Reuse the destination sampler for the bias
+            writer->op_load_once(bia, dst_sampler);
+        }
+        auto &tile_bia = bia->tile();
+
+        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_bia);
+    }
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+    const auto dst_shape = _dst->tensor_shape();
+    const auto desc      = _settings.direct_conv_descriptor();
+
+    const unsigned int n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+    const unsigned int m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+    Window win = calculate_max_window(dst_shape, Steps(n0, m0));
+
+    const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], m0);
+    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
+    win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+    return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+
+    return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000..ac32d2d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d final : public IGpuCkwComponentDriver
+{
+public:
+    using Attributes = ClComponentDirectConv2d::Attributes;
+    using Settings   = ClComponentDirectConv2d::Settings;
+
+public:
+    /** Constructor
+     *
+     * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+     *
+     * @param[in] id         Component id
+     * @param[in] tensors    Tensor arguments to the component
+     * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+     * @param[in] settings   Component settings. Settings are a set of parameters that influence the implementation of a component
+     */
+    GpuCkwDirectConv2d(ComponentId                      id,
+                       const ArgumentPack<ITensorInfo> &tensors,
+                       const Attributes                &attributes,
+                       const Settings                  &settings);
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+
+    /** Destructor */
+    ~GpuCkwDirectConv2d() override = default;
+
+    // Inherited methods overriden
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
+    std::string  get_name(const ComponentGroup &comp_group) const override;
+
+private:
+    const ITensorInfo *_src;
+    const ITensorInfo *_wei;
+    const ITensorInfo *_bia;
+    const ITensorInfo *_dst;
+
+    const Attributes _attributes;
+    const Settings   _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index 3965dec..a713c82 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -26,8 +26,14 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -145,16 +151,27 @@
     const Attributes                &attributes,
     const Settings                  &settings)
     : IGpuKernelComponent{ id, properties, tensors },
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer{ std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings) }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
+
 ClComponentDirectConv2d::~ClComponentDirectConv2d()
 {
 }
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index 8e555dc..24acb1b 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -66,7 +66,11 @@
 };
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateDirectConv2d;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+class GpuCkwDirectConv2d;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentDirectConv2d final : public IGpuKernelComponent
 {
@@ -134,8 +138,12 @@
     ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -143,7 +151,11 @@
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index 5ab1faf..f27a179 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/framework/Fixture.h"
@@ -250,5 +249,3 @@
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF