Add GpuKernelArgumentBinding for runtime argument setting

* Add flexible runtime argument setting that accepts argument bindings
exported from ckw.

* Introduce internal build flag ACL_INTERNAL_TEST_CKW_IN_DF. If set to
true, ckw will be tested in dynamic fusion validation tests. Otherwise
it will not be tested and the dynamic fusion will keep using
ClTemplateWriter instead.

* Fix CKW sampler for elementwise binary to deal with tile sizes > 1
in both dimensions

Resolves: COMPMID-6282
Partially resolves: COMPMID-6260

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I0ab225a4484eb2119643d900a4e72806558626ee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9917
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 302d4c8..226e1a2 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -33,6 +33,7 @@
 namespace dynamic_fusion
 {
 /** Contain information required to set up a kernel argument at run time
+ * @deprecated To be removed along with ClTemplateWriter
  */
 struct GpuKernelArgumentInfo
 {
@@ -66,10 +67,9 @@
     }
     Type type{ Type::Tensor_4D_t_Buffer };
 };
-
 bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
-
 /** Kernel argument information linked with its corresponding @ref ITensorInfo
+ * @deprecated To be removed along with ClTemplateWriter
  */
 class GpuKernelArgument
 {
@@ -124,6 +124,130 @@
     TensorInfo            _tensor_info{};
     GpuKernelArgumentInfo _kernel_arg_info{};
 };
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+/** Describe how the tensor runtime memory can be accessed
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorStorageType
+{
+    Unknown,
+    ClBufferUint8Ptr,
+    ClImage2dReadOnly,
+    ClImage2dWriteOnly,
+};
+
+/** Describe additional runtime information about the tensor
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorComponentType
+{
+    Unknown,
+    OffsetFirstElement,
+    Stride0,
+    Stride1,
+    Stride2,
+    Stride3,
+    Stride4,
+    Dim0,
+    Dim1,
+    Dim2,
+    Dim3,
+    Dim4,
+    Dim1xDim2,
+    Dim2xDim3,
+    Dim1xDim2xDim3,
+};
+
+/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
+ *
+ * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
+ * void kernel(arg0, arg1, ... argN)
+ *
+ * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
+ * A tensor is described as: **storages** followed by **components**
+ *
+ * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
+ * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
+ *
+ * The arguments are arranged in the order of use in the generated kernel code:
+ *
+ *  arg0   , arg1      , arg2      ,                         ...,                         , argN
+ *  storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
+ * |                   tensor0                       |                    tensor1                    |
+ *
+ * An example argument list:
+ *
+ * void kernel(
+ *  image2d_t       t0_image,               // TensorStorageType::ClImage2dReadOnly
+ *  uint8_t*        t0_ptr,                 // TensorStorageType::ClBufferUint8Ptr
+ *  uint            t0_dim0,                // TensorComponentType::Dim0
+ *  uint            t0_stride1,             // TensorComponentType::Stride1
+ *  image2d_t       t1_image,               // TensorStorageType::ClImage2dReadOnly
+ *  uint            t1_dim1xdim2,           // TensorComponentType::Dim1xDim2
+ *  uint            t1_stride1,             // TensorComponentType::Stride1
+ *  uint            t1_stride2,             // TensorComponentType::Stride2
+ * )
+ *
+ */
+class GpuKernelArgumentBinding
+{
+public:
+    enum class Type : int32_t
+    {
+        TensorStorage,  /**< @ref TensorStorageType */
+        TensorComponent /**< @ref TensorComponentType */
+    };
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
+        : _type{ Type::TensorStorage }, _id{ id }, _value{}
+    {
+        _value.tensor_storage_type = storage;
+    }
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
+        : _type{ Type::TensorComponent }, _id{ id }, _value{}
+    {
+        _value.tensor_component_type = component;
+    }
+    /** Storage type of the tensor
+     */
+    TensorStorageType tensor_storage_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
+        return _value.tensor_storage_type;
+    }
+    /** Component of the tensor
+     */
+    TensorComponentType tensor_component_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
+        return _value.tensor_component_type;
+    }
+    /** Id of the tensor this kernel argument belongs to
+     */
+    ITensorInfo::Id id() const
+    {
+        return _id;
+    }
+    /** Type of the kernel argument
+     */
+    Type type() const
+    {
+        return _type;
+    }
+
+private:
+    Type            _type;
+    ITensorInfo::Id _id;
+    union Value
+    {
+        TensorStorageType   tensor_storage_type;
+        TensorComponentType tensor_component_type;
+    };
+    Value _value;
+};
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute