Use the stable CKW API in the GPU dynamic fusion backend

- Refactor all kernels to work with the CKW stable API
- Add support for sub-tiles in the op_load/op_store CKW operators
- Fix mismatch in resize
- Add comments in all kernels written with CKW to help developers
  understand the structure of the code
- Add texture image support in depthwise convolution written with CKW
- Add support for different block sizes in depthwise convolution
- Remove the use of the dynamic fusion helper functions
- Add support for floor in the op_unary() of CKW

Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>

Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index a98ebed..7d16f35 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -34,15 +34,16 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
-                                      const ITensor        *tensor,
-                                      const TensorSampler  *sampler,
-                                      const Tensor3dMapper *mapper,
-                                      MemoryOperation       op,
-                                      const CLTile         *dst)
+bool CLMemoryOpBufferHelper::validate(const CLKernelWriter   *writer,
+                                      const ITensor          *tensor,
+                                      const TensorSampler    *sampler,
+                                      const Tensor3dMapper   *mapper,
+                                      MemoryOperation         op,
+                                      const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper, op, dst);
 
@@ -100,17 +101,14 @@
  *  The outermost block is x, then z and then y. This is why, if/else's covering for y are initialized
  *  at each row write. In some addressing modes, such as None, no if/else conditions are written.
  */
-void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpBufferHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    _dst = dst;
-
     CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
 
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
-    _coord_orig_z  = _coord_z;
+    _coord_x      = x->scalar(0, 0).str;
+    _coord_z      = z->scalar(0, 0).str;
+    _coord_b      = b->scalar(0, 0).str;
+    _coord_orig_z = _coord_z;
 
     out_of_bound_initialize_x(_coord_x);
     out_of_bound_initialize_z(_coord_z);
@@ -121,7 +119,7 @@
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_statement(_op, _ls_width_full, dst, address);
 
@@ -133,10 +131,17 @@
     // The left over load/store will be written in the finalize stage
     if (_ls_width_part.size() != 0)
     {
-        int32_t col_start = 0;
+        int32_t        col_start     = 0;
+        const TileArea original_area = _dst.area();
+
         for (int32_t partial_width : _ls_width_part)
         {
-            const std::string dst       = _dst->vector(row_id, col_start, partial_width).str;
+            // Set the active area
+            const TileArea area(original_area.row_start(), original_area.row_end(), col_start,
+                                col_start + partial_width);
+            _dst.area(area);
+
+            const std::string dst       = _dst.vector(row_id).str;
             const std::string coord_x   = _coord_x + " + " + std::to_string(col_start);
             const std::string address   = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
             const std::string statement = to_statement(_op, partial_width, dst, address);
@@ -144,6 +149,8 @@
 
             col_start += partial_width;
         }
+        // Restore the original area
+        _dst.area(original_area);
     }
 }
 
@@ -304,7 +311,7 @@
     CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
 
     const std::string ptr_buf  = _tensor->storage(tensor_storage).val;
-    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
+    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst.data_type(), 1);
 
     std::string address;
     address += "(__global ";
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
index 4e1a842..a6b3272 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
-#define CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
+#ifndef CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
+#define CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
 
 #include "src/cl/helpers/ICLMemoryOpHelper.h"
 
@@ -37,6 +37,8 @@
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL
@@ -45,19 +47,23 @@
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpBufferHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpBufferHelper(CLKernelWriter         *writer,
+                           ITensor                *tensor,
+                           TensorSampler          *sampler,
+                           MemoryOperation         op,
+                           const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
@@ -78,12 +84,12 @@
     std::vector<LeftoverDescriptor> _leftovers_x{};
     std::string                     _coord_orig_z{};
 
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_x(const std::string &coord);
     void out_of_bound_finalize_x();
@@ -99,4 +105,4 @@
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index b7d146b..f392cd8 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -33,18 +33,15 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-void CLMemoryOpImage2dHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpImage2dHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, dst));
-
-    _dst           = dst;
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
+    _coord_x = x->scalar(0, 0).str;
+    _coord_z = z->scalar(0, 0).str;
+    _coord_b = b->scalar(0, 0).str;
 }
 
 void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord_y)
@@ -52,7 +49,7 @@
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string sampler = to_ls_image2d_sampler();
     const std::string coord   = to_ls_image2d_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_ls_image2d(_op, _ls_width_full, dst, sampler, coord);
@@ -66,16 +63,16 @@
 {
 }
 
-bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
-                                       const ITensor        *tensor,
-                                       const TensorSampler  *sampler,
-                                       const Tensor3dMapper *mapper,
-                                       MemoryOperation       op,
-                                       const CLTile         *dst)
+bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter   *writer,
+                                       const ITensor          *tensor,
+                                       const TensorSampler    *sampler,
+                                       const Tensor3dMapper   *mapper,
+                                       MemoryOperation         op,
+                                       const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper);
 
-    if (dst->info().width() != 4)
+    if (dst.width() != 4)
     {
         return false;
     }
@@ -95,7 +92,7 @@
     {
         return false;
     }
-    if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
+    if ((dst.data_type() != DataType::Fp32) && (dst.data_type() != DataType::Fp16))
     {
         return false;
     }
@@ -143,10 +140,12 @@
                                                    const std::string &address) const
 {
     CKW_UNUSED(vector_width);
+    CKW_ASSERT_MSG(_dst.data_type() == DataType::Fp32 || _dst.data_type() == DataType::Fp16,
+                   "Image2d only supports floating-point data type");
 
     const TensorStorageType tensor_storage = _sampler->storage();
     const std::string       image2d_obj    = _tensor->storage(tensor_storage).val;
-    const std::string       post_fix       = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
+    const std::string       post_fix       = _dst.data_type() == DataType::Fp32 ? "f" : "h";
 
     switch (op)
     {
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
index fd9b097..6c42c13 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
@@ -35,6 +35,8 @@
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL for Image2d type */
@@ -42,29 +44,33 @@
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpImage2dHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpImage2dHelper(CLKernelWriter         *writer,
+                            ITensor                *tensor,
+                            TensorSampler          *sampler,
+                            MemoryOperation         op,
+                            const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
 private:
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_y(const std::string &coord);
     void out_of_bound_finalize_y();
diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
index f46fee9..a5b679a 100644
--- a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
@@ -28,6 +28,7 @@
 #include "ckw/TensorSampler.h"
 
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 #include <cstdint>
 #include <memory>
@@ -55,18 +56,24 @@
      * @param[in] tensor  @ref ckw::ITensor object to perform the memory operation on
      * @param[in] sampler @ref ckw::TensorSampler object that tells how to sample a tensor
      * @param[in] op      The memory operation to be done (e.g. Load/Store)
+     * @param[in] dst     The tile to perform the memory operation on
      */
-    ICLMemoryOpHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op)
+    ICLMemoryOpHelper(CLKernelWriter         *writer,
+                      ITensor                *tensor,
+                      TensorSampler          *sampler,
+                      MemoryOperation         op,
+                      const TileView<CLTile> &dst)
+        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op), _dst(dst)
     {
-        _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _mapper        = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _ls_width_full = _dst.width();
     }
 
     /** Copy constructor */
-    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = delete;
 
     /** Assignment operator overload */
-    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = delete;
 
     /** Destructor */
     virtual ~ICLMemoryOpHelper() = default;
@@ -75,12 +82,11 @@
      *  the batch offset as a tile object, and initializes the code inside
      *  the writer object.
      *
-     * @param[in] dst  tile object to perform the memory operation on
      * @param[in] x    tile object that describes the x-coordinate of the tensor involved
      * @param[in] z    tile object that describes the z-coordinate of the tensor involved
      * @param[in] b    tile object that describes the batch offset of the tensor involved
      */
-    virtual void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) = 0;
+    virtual void initialize(const CLTile *x, const CLTile *z, const CLTile *b) = 0;
 
     /** Method that writes the actual code to the writer that performs the mentioned memory
      *  operation on the tile initialized. It writes the code for a specific row given in the
@@ -104,7 +110,7 @@
     TensorSampler                  *_sampler{nullptr};
     MemoryOperation                 _op;
     std::unique_ptr<Tensor3dMapper> _mapper{nullptr};
-    const CLTile                   *_dst{nullptr};
+    TileView<CLTile>                _dst{};
     int32_t                         _ls_width_full{0};
     std::string                     _coord_x{};
     std::string                     _coord_z{};
@@ -112,4 +118,4 @@
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H