Implement new Elementwise Dynamic Fusion Operators: Div, Floor

Resolves: COMPMID-5355

Change-Id: I92f73fbe885f28bbe7b07965b90cfd807c93602f
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7745
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index 96a845c..3ffbc07 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -74,8 +74,9 @@
     ClExecutionDescriptor exec_desc{};
     Status                st{};
 
-    const auto data_type = DataType::F32;
-    const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
+    const auto data_type    = DataType::F32;
+    const auto conv_info    = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
+    const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::ADD };
 
     const auto width     = 7U;
     const auto height    = 6U;
@@ -99,7 +100,7 @@
     const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
 
     const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info };
-    const ClEltwiseAddKernelDescriptor   eltwise_add_desc{};
+    const ClElementwiseKernelDescriptor  eltwise_add_desc{ eltwise_info };
     const TileDescriptor                 store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
 
     ArgumentID src_id{ g_arg_placeholder };
@@ -119,7 +120,7 @@
     st = add_tensor(bp, &dst_info, dst_id);
 
     st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
-    st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
+    st = add_kcomp_eltwise_op(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
     st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id);
 
     exec_desc.skip_sliding_window = true;
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
new file mode 100644
index 0000000..2b8f69e
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#include "arm_compute/core/TensorInfo.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/validation/reference/Floor.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Operator_Floor_1_F32, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = floor(input), built as a single-operator graph and compared against reference::floor_layer
+     */
+    const auto data_type    = DataType::F32;
+    const auto data_layout  = DataLayout::NHWC;
+    const auto t_shape      = TensorShape(32, 16);
+    auto       t_input_info = TensorInfo(t_shape, 1, data_type, data_layout);
+    auto       t_dst_info   = TensorInfo(); // Left default-constructed; presumably filled in by shape inference - TODO confirm
+
+    FloorDescriptor floor_desc{};
+
+    // Create reference input tensor (NHWC, same shape and data type as the operator input)
+    SimpleTensor<float> ref_t_input{ t_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+    // Fill reference input; the same fill arguments (0, library.get()) are used for the CL input further down
+    fill<float>(ref_t_input, 0, library.get());
+    // Permute the reference input (NCHW per the variable naming). NOTE(review): t_dst_shape_nchw is permuted below but not referenced again in this test
+    auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+    auto t_dst_shape_nchw = t_shape;
+    permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+    // Compute the reference floor on the permuted input. NOTE(review): ref_t_dst (permuted back) is not referenced again; validation uses ref_t_dst_nchw
+    auto       ref_t_dst_nchw = reference::floor_layer(ref_t_input_nchw);
+    const auto ref_t_dst      = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+    CLScheduler::get().default_reinit();
+    const auto    cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    OperatorGraph op_graph;
+
+    const auto op_t_input = add_tensor(op_graph, t_input_info);
+    const auto op_t_dst   = add_tensor(op_graph, t_dst_info);
+
+    add_op_floor(op_graph, floor_desc, op_t_input, op_t_dst);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload              workload;
+    build(workload, op_graph, workload_ctx);
+
+    ClCompositeOperator op;
+    op.configure(cl_compile_ctx, workload);
+
+    // Construct the CL tensors backing the graph's input and output
+    CLTensor t_input{};
+    CLTensor t_dst{};
+
+    // Init tensors from the TensorInfo objects that were registered with the operator graph
+    t_input.allocator()->init(t_input_info);
+    t_dst.allocator()->init(t_dst_info);
+
+    // Allocate both tensors, then fill the input with the same fill arguments as the reference input
+    t_input.allocator()->allocate();
+    t_dst.allocator()->allocate();
+    fill<float>(CLAccessor(t_input), 0, library.get());
+    // "Pack" tensors: associate each operator-graph tensor handle with its CL tensor
+    OpTensorBinding bp_tensors({ { op_t_input, &t_input },
+        { op_t_dst, &t_dst }
+    });
+
+    // Populate prepare and run pack-maps (including allocating aux tensors)
+    ClAuxTensorData aux_tensor_data{};
+    TensorPackMap   prepare_pack_map{};
+    TensorPackMap   run_pack_map{};
+    bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
+
+    op.prepare(prepare_pack_map);
+    op.run(run_pack_map);
+    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+    validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); // NOTE(review): NHWC target vs NCHW reference - presumably validate() accounts for the layout; confirm
+}
+
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
\ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
index fe8d23e..3a8b7c8 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
@@ -77,8 +77,8 @@
     auto       t_acc_info       = TensorInfo(); // Intermediate tensor for cond3
     auto       t_dst_info       = TensorInfo();
 
-    Conv2dDescriptor conv2d_desc{};
-    AddDescriptor    add_desc{};
+    Conv2dDescriptor      conv2d_desc{};
+    ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
 
     // Create reference
     SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
@@ -119,7 +119,7 @@
 
     auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
     force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
-    add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -180,8 +180,8 @@
     auto       t_acc_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
     auto       t_dst_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
 
-    Conv2dDescriptor conv2d_desc{};
-    AddDescriptor    add_desc{};
+    Conv2dDescriptor      conv2d_desc{};
+    ElementwiseDescriptor add_desc{};
 
     OperatorGraph op_graph;
 
@@ -192,7 +192,7 @@
     const auto op_t_dst       = add_tensor(op_graph, t_dst_info);
 
     auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
-    add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
     force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
@@ -290,7 +290,7 @@
     auto t_dst_info    = TensorInfo();
 
     OperatorGraph op_graph;
-    const auto    add_desc = AddDescriptor{};
+    const auto    add_desc = ElementwiseDescriptor{};
 
     const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
     const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info);
@@ -300,9 +300,9 @@
     const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred
     const auto op_t_dst    = add_tensor(op_graph, t_dst_info);
 
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -334,7 +334,7 @@
 
     OperatorGraph op_graph;
     const auto    conv2d_desc = Conv2dDescriptor{};
-    const auto    add_desc    = AddDescriptor{};
+    const auto    add_desc    = ElementwiseDescriptor{};
 
     const auto op_t_l0_0_input  = add_tensor(op_graph, t_l0_0_input_info);
     const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info);
@@ -345,8 +345,8 @@
     const auto op_t_dst         = add_tensor(op_graph, t_dst_info);
 
     add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -374,7 +374,7 @@
 
     OperatorGraph op_graph;
     const auto    conv2d_desc = Conv2dDescriptor{};
-    const auto    add_desc    = AddDescriptor{};
+    const auto    add_desc    = ElementwiseDescriptor{};
 
     const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
     const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info);
@@ -382,7 +382,7 @@
     const auto op_t_state1 = add_tensor(op_graph, state1_info);
 
     add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;