Add DirectConvolution2D kernel component for dynamic fusion

Introduce a DirectConv2d kernel component for the experimental dynamic
fusion interface and extend the ClCompositeKernel validation suite with
a test case that fuses a direct convolution with an elementwise add.
The existing GEMM-based test cases are renamed accordingly.

Resolves: COMPMID-5156

Change-Id: I438da924cb80d3bce72106b06ca7181e0606bd01
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7399
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index cb36590..9e1b4d8 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -32,8 +32,10 @@
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
 #include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/AccessWindowStatic.h"
@@ -83,7 +85,7 @@
 TEST_SUITE(ClCompositeKernel)
 TEST_SUITE(Validate)
 
-TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
+TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
 {
     /* Computation:
      * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
@@ -100,11 +102,11 @@
     auto       t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
     auto       t_dst_info  = TensorInfo(t_dst_shape, 1, data_type);
 
-    const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
-    const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
-    const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
-    const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
-    const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };
+    const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
+    const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
+    const ClTensorDescriptor t_bias_desc{ &t_bias_info };
+    const ClTensorDescriptor t_addend_desc{ &t_dst_info };
+    const ClTensorDescriptor t_dst_desc{ &t_dst_info };
 
     ClKernelBlueprint bp;
     ArgumentID        tid_lhs;
@@ -134,10 +136,10 @@
     st = set_tile_info(bp, store_tile_info);
     st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
 
-    ClExecutionDescriptor exec_desc;
+    ClExecutionDescriptor exec_desc{};
     st = tune_static(exec_desc, cl_code);
 
-    CLScheduler::get().default_init();
+    CLScheduler::get().default_reinit();
     ClCompositeKernel kernel;
     kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
 
@@ -193,10 +195,158 @@
     validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
 }
 
+TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = add(addend, direct_conv2d(lhs, rhs, bias)) (non-broadcast)
+     */
+
+    ClCompositeKernel     kernel{};
+    ClKernelBlueprint     bp{};
+    ClKernelCode          cl_code{};
+    ClExecutionDescriptor exec_desc{};
+    Status                st{};
+
+    const auto data_type = DataType::F32;
+    const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U);
+
+    const auto width     = 7U;
+    const auto height    = 6U;
+    const auto IFM       = 5U;
+    const auto OFM       = 4U;
+    const auto kernel_sz = 3U;
+
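+    // Shapes are expressed in NHWC layout, with the channel dimension innermost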
+    const auto src_shape = TensorShape(IFM, width, height);
+    const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM);
+    const auto bia_shape = TensorShape(OFM);
+    const auto dst_shape = TensorShape(OFM, width, height);
+
+    auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC);
+    auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC);
+    auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC);
+    auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC);
+
+    const auto src_desc    = ClTensorDescriptor(&src_info);
+    const auto wei_desc    = ClTensorDescriptor(&wei_info);
+    const auto bia_desc    = ClTensorDescriptor(&bia_info);
+    const auto addend_desc = ClTensorDescriptor(&dst_info);
+    const auto dst_desc    = ClTensorDescriptor(&dst_info);
+
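+    // Tile sizes (N0, M0) used to describe the store component's output tile below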
+    const auto n0 = std::min(OFM, 4u);
+    const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
+
+    const ClKernelComponentDescriptor common_kernel_desc{};
+    const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info };
+    const EltwiseAddDescriptor        eltwise_add_desc{ ConvertPolicy::WRAP };
+    const TileDescriptor              store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
+
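+    // Argument IDs, filled in when the corresponding tensors are added to the blueprint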
+    ArgumentID src_id{ g_arg_placeholder };
+    ArgumentID wei_id{ g_arg_placeholder };
+    ArgumentID bia_id{ g_arg_placeholder };
+    ArgumentID acc_id{ g_arg_placeholder };
+    ArgumentID addend_id{ g_arg_placeholder };
+    ArgumentID dst_id{ g_arg_placeholder };
+
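+    // Register the tensor arguments and the intermediate accumulator with the blueprint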
+    st = add_tensor_argument(bp, src_desc, src_id);
+    st = add_tensor_argument(bp, wei_desc, wei_id);
+    st = add_tensor_argument(bp, bia_desc, bia_id);
+    st = add_tensor_intermed(bp, acc_id);
+    st = add_tensor_argument(bp, addend_desc, addend_id);
+    st = add_tensor_argument(bp, dst_desc, dst_id);
+
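+    // Fuse the components: direct conv2d into the accumulator, elementwise add of the addend, then the store to the destination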
+    st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
+    st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id);
+    st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect);
+
+    exec_desc.skip_sliding_window = true;
+
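+    // Set the store tile info, build the kernel code for a G71 target and statically tune the execution descriptor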
+    st = set_tile_info(bp, store_tile_info);
+    st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
+    st = tune_static(exec_desc, cl_code);
+
+    CLScheduler::get().default_reinit();
+    kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
+
+    // Construct tensors
+    CLTensor src{};
+    CLTensor wei{};
+    CLTensor bia{};
+    CLTensor addend{};
+    CLTensor dst{};
+
+    // Init tensors
+    src.allocator()->init(src_info);
+    wei.allocator()->init(wei_info);
+    bia.allocator()->init(bia_info);
+    addend.allocator()->init(dst_info);
+    dst.allocator()->init(dst_info);
+
+    // "Pack" tensors
+    TensorBinding tensors({ { src_id, &src },
+        { wei_id, &wei },
+        { bia_id, &bia },
+        { addend_id, &addend },
+        { dst_id, &dst }
+    });
+
+    // Allocate and fill tensors
+    src.allocator()->allocate();
+    wei.allocator()->allocate();
+    bia.allocator()->allocate();
+    addend.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    fill<float>(CLAccessor(src), 0);
+    fill<float>(CLAccessor(wei), 1);
+    fill<float>(CLAccessor(bia), 2);
+    fill<float>(CLAccessor(addend), 3);
+
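+    // Run the fused kernel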
+    CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
+
+    // Create reference
+    SimpleTensor<float> ref_src_nhwc{ src_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+    SimpleTensor<float> ref_wei_nhwc{ wei_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+    SimpleTensor<float> ref_bia_nhwc{ bia_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+    SimpleTensor<float> ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+    // Fill reference
+    fill<float>(ref_src_nhwc, 0);
+    fill<float>(ref_wei_nhwc, 1);
+    fill<float>(ref_bia_nhwc, 2);
+    fill<float>(ref_addend_nhwc, 3);
+
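+    // Permute the NHWC reference tensors to NCHW for the reference computation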
+    auto ref_src    = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U));
+    auto ref_wei    = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U));
+    auto ref_bia    = reference::permute(ref_bia_nhwc, PermutationVector(1U, 2U, 0U));
+    auto ref_addend = reference::permute(ref_addend_nhwc, PermutationVector(1U, 2U, 0U));
+
+    TensorShape dst_shape_nchw{ dst_shape };
+    permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
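+    // Reference: direct convolution followed by the fused elementwise add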
+    const auto ref_dst = reference::arithmetic_operation(
+                             ArithmeticOperation::ADD,
+                             ref_addend,
+                             reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info),
+                             data_type,
+                             eltwise_add_desc.convert_policy);
+
+    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+    validate(CLAccessor(dst), ref_dst, tolerance_f32);
+}
+
 TEST_SUITE_END() // Validate
 
 TEST_SUITE(Benchmark)
-TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
+TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
 {
     using std::chrono::duration_cast;
     using std::chrono::microseconds;
@@ -205,19 +346,19 @@
     /* Computation:
      * out = add(addend, gemm_native(lhs, rhs, bias))
      */
-    const auto         data_type     = DataType::F32;
-    const unsigned int m             = 12 * 12;
-    const unsigned int n             = 64;
-    const unsigned int k             = 384;
-    const auto         t_lhs_shape   = TensorShape(k, m);
-    const auto         t_rhs_shape   = TensorShape(n, k);
-    const auto         t_dst_shape   = TensorShape(n, m);
-    auto               t_lhs_info    = TensorInfo(t_lhs_shape, 1, data_type);
-    auto               t_rhs_info    = TensorInfo(t_rhs_shape, 1, data_type);
-    auto               t_bias_info   = TensorInfo(TensorShape(), 1, data_type);
-    auto               t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
-    auto               t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
-    auto               t_dst_info    = TensorInfo(t_dst_shape, 1, data_type);
+    const auto data_type     = DataType::F32;
+    const auto m             = 12U * 12U;
+    const auto n             = 64U;
+    const auto k             = 384U;
+    const auto t_lhs_shape   = TensorShape(k, m);
+    const auto t_rhs_shape   = TensorShape(n, k);
+    const auto t_dst_shape   = TensorShape(n, m);
+    auto       t_lhs_info    = TensorInfo(t_lhs_shape, 1, data_type);
+    auto       t_rhs_info    = TensorInfo(t_rhs_shape, 1, data_type);
+    auto       t_bias_info   = TensorInfo(TensorShape(), 1, data_type);
+    auto       t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
+    auto       t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
+    auto       t_dst_info    = TensorInfo(t_dst_shape, 1, data_type);
 
     const auto                 common_kernel_desc = ClKernelComponentDescriptor{};
     const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
@@ -242,7 +383,7 @@
                                data_type,
                                eltwise_add_desc.convert_policy);
 
-    CLScheduler::get().default_init();
+    CLScheduler::get().default_reinit();
 
     /* Condition 0: Dynamic Fused Kernel */
     CLTensor cond0_t_dst{};
@@ -256,11 +397,11 @@
         ArgumentID        tid_l1_addend;
         ArgumentID        tid_dst;
 
-        const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
-        const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
-        const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
-        const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
-        const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };
+        const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
+        const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
+        const ClTensorDescriptor t_bias_desc{ &t_bias_info };
+        const ClTensorDescriptor t_addend_desc{ &t_dst_info };
+        const ClTensorDescriptor t_dst_desc{ &t_dst_info };
 
         ClKernelCode cl_code;
         TICK(cond0_build_time)
@@ -282,7 +423,7 @@
         TOCK(cond0_build_time, measurements)
 
         TICK(cond0_tune_time)
-        ClExecutionDescriptor exec_desc;
+        ClExecutionDescriptor exec_desc{};
         st = tune_static(exec_desc, cl_code);
         TOCK(cond0_tune_time, measurements)