Use the stable CKW API in the GPU dynamic fusion backend

- Refactor all kernels to work with the CKW stable API
- Add support for sub-tile in the op_load/op_store CKW operator
- Fix mismatch in resize
- Add comments in all kernels written with CKW to help developers
understand the structure of the code
- Add texture image support in depthwise convolution written with CKW
- Add support for different block sizes in depthwise convolution
- Remove the use of the dynamic fusion helper functions.
- Add support for floor in the op_unary() of CKW

Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>

Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
index bfa6724..44a4df1 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
@@ -61,25 +61,19 @@
 
         _tests.push_back({ 2, 4, DataType::Bool, 2, 1, 2, 1, DataType::Fp32, BinaryOp::GreaterEqual, "G0__dst__0 = (float4)G0__lhs__0 >= (float4)G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 >= (float4)G0__rhs__1;\n" }); // LHS and RHS x-dimension broadcast.
 
-        _tests.push_back({ 2, 3, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
+        _tests.push_back({ 2, 2, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
                            "G0__dst__0.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__1.s1);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__0.s0);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s2, G0__rhs__0.s2, G0__dst__0.s0);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s0, G0__rhs__1.s0, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__1.s1, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s2, G0__rhs__1.s2, G0__dst__0.s1);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__0.s0, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s1, G0__rhs__0.s1, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s2, G0__rhs__0.s2, G0__dst__1.s0);\n"
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__1.s1);\n"
                            "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" });
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s2, G0__rhs__1.s2, G0__dst__1.s1);\n" });
     }
 
     bool run() override