Implement new Elementwise Dynamic Fusion Operators: Div, Floor

Resolves: COMPMID-5355

Change-Id: I92f73fbe885f28bbe7b07965b90cfd807c93602f
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7745
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
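
A minimal host-side sketch of the new public API, mirroring the pattern in
examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp updated below.
Shapes, the helper name and the surrounding setup are illustrative
assumptions, not part of this patch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void build_div_graph() // hypothetical helper, for illustration only
{
    OperatorGraph op_graph;

    // Destination info is left empty; the graph infers it from the inputs
    auto       t_lhs_info = TensorInfo(TensorShape(32U, 16U), 1, DataType::F32, DataLayout::NHWC);
    auto       t_rhs_info = TensorInfo(TensorShape(32U, 16U), 1, DataType::F32, DataLayout::NHWC);
    auto       t_dst_info = TensorInfo();
    const auto op_t_lhs   = add_tensor(op_graph, t_lhs_info);
    const auto op_t_rhs   = add_tensor(op_graph, t_rhs_info);
    const auto op_t_dst   = add_tensor(op_graph, t_dst_info);

    // The descriptor now selects the arithmetic operation to perform (ADD or DIV)
    ElementwiseDescriptor div_desc{ ArithmeticOperation::DIV };
    add_op_elementwise_op(op_graph, div_desc, op_t_lhs, op_t_rhs, op_t_dst);
}
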
diff --git a/Android.bp b/Android.bp
index 9f6f8de..74c2b96 100644
--- a/Android.bp
+++ b/Android.bp
@@ -370,7 +370,8 @@
         "src/core/Version.cpp",
         "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
         "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
-        "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
+        "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
+        "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
         "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
         "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
         "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp",
diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h
index fd8fcd5..cab83c7 100644
--- a/arm_compute/core/experimental/OperatorGraph.h
+++ b/arm_compute/core/experimental/OperatorGraph.h
@@ -176,17 +176,18 @@
  */
 void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method);
 
-/** Descriptor for Addition operation
+/** Descriptor for Elementwise binary operation
  *
  */
-struct AddDescriptor
+struct ElementwiseDescriptor
 {
     /* TOSA compliant attribute parameters start */
     /* TOSA compliant attribute parameters end */
     /* Non-TOSA compliant attribute parameters start */
+    ArithmeticOperation op;
     /* Non-TOSA compliant attribute parameters end */
 };
-/** Add op Add to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
+/** Add op Elementwise to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
  *
  * @param[in,out] graph OperatorGraph where the operator is added to
  * @param[in]     desc  Operator descriptor
@@ -196,12 +197,34 @@
  *
  * @return Operator
  */
-Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst);
+Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst);
+
+/** Descriptor for Floor operation
+ *
+ */
+struct FloorDescriptor
+{
+    /* TOSA compliant attribute parameters start */
+    /* TOSA compliant attribute parameters end */
+    /* Non-TOSA compliant attribute parameters start */
+    /* Non-TOSA compliant attribute parameters end */
+};
+/** Add op Floor to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
+ *
+ * @param[in,out] graph OperatorGraph where the operator is added to
+ * @param[in]     desc  Operator descriptor
+ * @param[in]     src   Source OpTensor
+ * @param[in]     dst   Destination OpTensor
+ *
+ * @return Operator
+ */
+Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst);
 
 bool operator==(const OpTensor &t0, const OpTensor &t1);
 bool operator==(const Padding2D &pad0, const Padding2D &pad1);
 bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1);
-bool operator==(const AddDescriptor &, const AddDescriptor &);
+bool operator==(const ElementwiseDescriptor &, const ElementwiseDescriptor &);
+bool operator==(const FloorDescriptor &, const FloorDescriptor &);
 
 } // namespace dynamic_fusion
 } // namespace experimental
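
A corresponding sketch for the new unary Floor operator, under the same
assumptions as the sketch above (hypothetical helper and shape;
FloorDescriptor carries no attributes in this patch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void build_floor_graph() // hypothetical helper, for illustration only
{
    OperatorGraph op_graph;

    auto       t_src_info = TensorInfo(TensorShape(32U, 16U), 1, DataType::F32, DataLayout::NHWC);
    auto       t_dst_info = TensorInfo(); // auto-initialised from src inside add_op_floor
    const auto op_t_src   = add_tensor(op_graph, t_src_info);
    const auto op_t_dst   = add_tensor(op_graph, t_dst_info);

    FloorDescriptor floor_desc{}; // no attributes yet
    add_op_floor(op_graph, floor_desc, op_t_src, op_t_dst);
}
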
diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
index 285509b..8aee67e 100644
--- a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+++ b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
@@ -218,12 +218,12 @@
 
         /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp  Add Elementwise Add Operator
         // [Add Elementwise Add Operator]
-        auto          t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
-        auto          t_dst_info       = TensorInfo();
-        const auto    op_t_l1_addend   = add_tensor(op_graph, t_l1_addend_info);
-        const auto    op_t_dst         = add_tensor(op_graph, t_dst_info);
-        AddDescriptor add_desc{};
-        add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+        auto                  t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
+        auto                  t_dst_info       = TensorInfo();
+        const auto            op_t_l1_addend   = add_tensor(op_graph, t_l1_addend_info);
+        const auto            op_t_dst         = add_tensor(op_graph, t_dst_info);
+        ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
+        add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
         // [Add Elementwise Add Operator]
 
         /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
diff --git a/filelist.json b/filelist.json
index 681cacd..e22ec1d 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2097,7 +2097,8 @@
     "dynamic_fusion": [
       "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
       "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
-      "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
+      "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
+      "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
       "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
       "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
 
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 9fad66d..53ce296 100644
--- a/src/core/CL/cl_kernels/common/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -703,7 +703,7 @@
 
     a_offset_s32[0].v *= A_OFFSET;
 
-    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
 #endif // defined(A_OFFSET)
 
 #if defined(B_OFFSET)
@@ -728,7 +728,7 @@
 
     T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
 
-    T_ADD_BROADCAST_X(int, M0, N0, offset_s32, bias, offset_s32);
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
 #endif // defined(ADD_BIAS)
 
     LOOP_UNROLLING(int, i, 0, 1, M0,
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index f1b422a..d34e24b 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -233,7 +233,7 @@
     T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 
 #endif // HAS_BIAS
 
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
index 587f398..807b990 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution3d.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -254,7 +254,7 @@
     }
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+    T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 
 #endif // HAS_BIAS
 
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 4f57a81..b24a6ae 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -175,7 +175,7 @@
         T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0);
 
         // c = c + bias[broadcasted]
-        T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
         T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index ec2593a..263a23e 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -236,7 +236,7 @@
         T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
 
         // c = c + bias[broadcasted]
-        T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+        T_ELTWISE_BROADCAST_ADD_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
 #endif // HAS_BIAS
 
         T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
index bab2ee8..0883cd9 100644
--- a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -111,7 +111,7 @@
 
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
-    T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
 #endif // defined(HAS_BIAS)
 
     T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
@@ -177,7 +177,7 @@
 
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // defined(HAS_BIAS)
 
     T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
@@ -287,7 +287,7 @@
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -374,7 +374,7 @@
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -488,7 +488,7 @@
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
@@ -586,7 +586,7 @@
     T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
 
     // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
 #endif // HAS_BIAS
 
     int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index 69898e1..0ce343e 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -970,6 +970,9 @@
 #define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 
+#define T_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define T_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
+
 /** Element-wise activation for quantized types
  *
  * @note Performs: activation(LHS) = DST
@@ -1014,11 +1017,15 @@
         })                                                        \
     })
 
-/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
+#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
  *
- * @note Performs: LHS + RHS[broadcasted] = DST
+ * @note Performs: LHS OP RHS[broadcasted] = DST
  * @note Both tiles must have same data type
  *
+ * @param[in]  T_ELWISE_OP   Elementwise operator to perform
  * @param[in]  DST_DATA_TYPE DST data type
  * @param[in]  M0            Number of LHS rows
  * @param[in]  N0            Number of LHS columns
@@ -1026,19 +1033,23 @@
  * @param[in]  rhs           RHS tile
  * @param[out] dst           DST tile
  */
-#define T_ADD_BROADCAST_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
     ({                                                      \
         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
         {                                                   \
-            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));             \
+            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
         })                                                  \
     })
 
-/** Element-wise addition between two tiles (LHS and RHS)
+#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(T_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
+/** Element-wise operation between two tiles (LHS and RHS)
  *
- * @note Performs: LHS + RHS = DST
+ * @note Performs: LHS OP RHS = DST
  * @note Both tiles must have same data type
  *
+ * @param[in]  T_ELWISE_OP   Elementwise operator to perform
  * @param[in]  DST_DATA_TYPE DST data type
  * @param[in]  M0            Number of LHS rows
  * @param[in]  N0            Number of LHS columns
@@ -1046,11 +1057,30 @@
  * @param[in]  rhs           RHS tile
  * @param[out] dst           DST tile
  */
-#define T_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
+#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
     ({                                                      \
         LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
         {                                                   \
-            dst[_m0].v = CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)) + CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0));             \
+            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
+        })                                                  \
+    })
+
+/** Floor operation on a tile
+ *
+ * @note Performs: floor(SRC) = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in]  DST_DATA_TYPE DST data type
+ * @param[in]  M0            Number of SRC rows
+ * @param[in]  N0            Number of SRC columns
+ * @param[in]  src           SRC tile
+ * @param[out] dst           DST tile
+ */
+#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
+    ({                                                      \
+        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
+        {                                                   \
+            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
         })                                                  \
     })
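
As a mental model, the following plain C++ mirrors what T_ELTWISE,
T_ELTWISE_BROADCAST_X and T_FLOOR compute on an M0 x N0 tile (scalar loops
stand in for the vectorised OpenCL code; the function and type names are
illustrative, not part of this patch):

#include <cmath>

// lhs/rhs/dst model row-major M0 x N0 tiles; Op is any binary functor,
// e.g. a lambda implementing T_ADD or T_DIV.
template <typename T, int M0, int N0, typename Op>
void t_eltwise(Op op, const T (&lhs)[M0][N0], const T (&rhs)[M0][N0], T (&dst)[M0][N0])
{
    for(int m = 0; m < M0; ++m)
        for(int n = 0; n < N0; ++n)
            dst[m][n] = op(lhs[m][n], rhs[m][n]); // LHS OP RHS = DST
}

template <typename T, int M0, int N0, typename Op>
void t_eltwise_broadcast_x(Op op, const T (&lhs)[M0][N0], const T (&rhs)[N0], T (&dst)[M0][N0])
{
    for(int m = 0; m < M0; ++m)
        for(int n = 0; n < N0; ++n)
            dst[m][n] = op(lhs[m][n], rhs[n]); // single RHS row broadcast along Y
}

template <typename T, int M0, int N0>
void t_floor(const T (&src)[M0][N0], T (&dst)[M0][N0])
{
    for(int m = 0; m < M0; ++m)
        for(int n = 0; n < N0; ++n)
            dst[m][n] = std::floor(src[m][n]); // floor(SRC) = DST
}
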
 
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index 0be2ba0..9b6daae 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -55,18 +55,32 @@
     return Status{};
 }
 
-Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClEltwiseAddKernelDescriptor &,
-                             ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
+Status add_kcomp_eltwise_op(ClKernelBlueprint &kernel_blueprint, const ClElementwiseKernelDescriptor &desc,
+                            ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
 {
     kernel_blueprint.impl().add_component(
-        std::make_unique<ClElementwiseAddKernelComponent>(
+        std::make_unique<ClElementwiseKernelComponent>(
             &kernel_blueprint,
+            desc,
             SharedVarLink{ src0_id, SharedVarIO::Input },
             SharedVarLink{ src1_id, SharedVarIO::Input },
             SharedVarLink{ dst_id, SharedVarIO::Output }));
 
     return Status{};
 }
+
+Status add_kcomp_floor(ClKernelBlueprint &kernel_blueprint, const ClFloorKernelDescriptor &,
+                       ArgumentID src_id, ArgumentID &dst_id)
+{
+    kernel_blueprint.impl().add_component(
+        std::make_unique<ClFloorKernelComponent>(
+            &kernel_blueprint,
+            SharedVarLink{ src_id, SharedVarIO::Input },
+            SharedVarLink{ dst_id, SharedVarIO::Output }));
+
+    return Status{};
+}
+
 Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &)
 {
     return Status{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 067e973..463fc5e 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -59,9 +59,13 @@
 };
 
 ///// Kernel Components /////
-/** Component: Eltwise Add */
-Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id,
-                             ArgumentID src1_id, ArgumentID &dst_id);
+/** Component: Eltwise Operator */
+Status add_kcomp_eltwise_op(ClKernelBlueprint &, const ClElementwiseKernelDescriptor &, ArgumentID src0_id,
+                            ArgumentID src1_id, ArgumentID &dst_id);
+
+/** Component: Floor */
+Status add_kcomp_floor(ClKernelBlueprint &, const ClFloorKernelDescriptor &, ArgumentID src_id,
+                       ArgumentID &dst_id);
 
 /** Component: Activation */
 Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
index b63e216..811cd79 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -237,7 +237,7 @@
             T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);
 
             // c = c + bias[broadcasted]
-            T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+            T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
         )_";
     }
 
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
similarity index 82%
rename from src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
rename to src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
index 965a68f..24a9eee 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
@@ -23,7 +23,7 @@
  */
 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
 
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -34,17 +34,17 @@
 {
 namespace dynamic_fusion
 {
-ComponentType ClElementwiseAddKernelComponent::get_component_type() const
+ComponentType ClElementwiseKernelComponent::get_component_type() const
 {
     return ComponentType::Simple;
 }
 
-std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
+std::set<std::string> ClElementwiseKernelComponent::get_headers_list() const
 {
     return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
 }
 
-Window ClElementwiseAddKernelComponent::get_window() const
+Window ClElementwiseKernelComponent::get_window() const
 {
     const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
     const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
@@ -64,7 +64,7 @@
     return win;
 }
 
-std::string ClElementwiseAddKernelComponent::get_component_code() const
+std::string ClElementwiseKernelComponent::get_component_code() const
 {
     std::string code;
     const bool  is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
@@ -72,7 +72,7 @@
     if(is_root)
     {
         return R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
     // IN_0(LHS)            {{lhs}}
     // IN_1(RHS)            {{rhs}}
     // OUT(dst, accum)      {{dst}}
@@ -87,19 +87,19 @@
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
 
 #if defined(IS_BROADCAST)
-        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
 #else // !defined(IS_BROADCAST)
-        T_ADD({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
 #endif // defined(IS_BROADCAST)
 
     }
-    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
 )_";
     }
     else
     {
         return R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
     // IN_0/Out(Accumulator)   {{acc}}
     // IN_1(Addend)        {{addend}}
 
@@ -110,17 +110,17 @@
         T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
 
 #if defined(IS_BROADCAST)
-        T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+        T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
 #else // !defined(IS_BROADCAST)
-        T_ADD({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+        T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
 #endif // defined(IS_BROADCAST)
     }
-    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+    //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
 )_";
     }
 }
 
-CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
+CLBuildOptions ClElementwiseKernelComponent::generate_build_options() const
 {
     const auto t_src_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
     const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
@@ -137,7 +137,7 @@
     return build_opts;
 }
 
-std::string ClElementwiseAddKernelComponent::generate_config_id() const
+std::string ClElementwiseKernelComponent::generate_config_id() const
 {
     auto        t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
     std::string config_id{};
@@ -151,7 +151,7 @@
     return config_id;
 }
 
-void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+void ClElementwiseKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
 {
     const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
     vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
@@ -162,7 +162,7 @@
     }
 }
 
-ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+ClElementwiseKernelComponent::TagLUT ClElementwiseKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
 {
     TagLUT     lut{};
     const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
@@ -199,6 +199,18 @@
     // Local build options
     lut["meta_kernel_id"] = id();
     lut["DATA_TYPE"]      = get_cl_type_from_data_type(t_dst_info->data_type());
+
+    switch(_desc.eltwise.op)
+    {
+        case ArithmeticOperation::DIV:
+            lut["ELTWISE_OP"] = "DIV";
+            break;
+        case ArithmeticOperation::ADD:
+            lut["ELTWISE_OP"] = "ADD";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
+    }
     return lut;
 }
 } // namespace dynamic_fusion
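
The {{ELTWISE_OP}} tag selected in get_tag_lut() is spliced into the code
template above, so the broadcast path expands to T_ELTWISE_BROADCAST_ADD_X or
T_ELTWISE_BROADCAST_DIV_X. A stand-alone sketch of that mapping (the enum and
function here are local stand-ins, not library declarations):

#include <stdexcept>
#include <string>

enum class ArithmeticOperation { ADD, DIV }; // stand-in for the library enum

// Maps the descriptor's arithmetic op to the token spliced into
// T_ELTWISE_{{ELTWISE_OP}} / T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X.
std::string eltwise_op_tag(ArithmeticOperation op)
{
    switch(op)
    {
        case ArithmeticOperation::ADD:
            return "ADD";
        case ArithmeticOperation::DIV:
            return "DIV";
        default:
            throw std::runtime_error("Arithmetic Operation not supported");
    }
}
// e.g. "T_ELTWISE_BROADCAST_" + eltwise_op_tag(ArithmeticOperation::DIV) + "_X"
//      yields "T_ELTWISE_BROADCAST_DIV_X"
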
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
similarity index 84%
rename from src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
rename to src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
index 5f8b156..91b14ff 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
@@ -34,11 +34,11 @@
 {
 namespace dynamic_fusion
 {
-class ClElementwiseAddKernelComponent : public IClKernelComponent
+class ClElementwiseKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
-        : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+    ClElementwiseKernelComponent(ClKernelBlueprint *blueprint, const ClElementwiseKernelDescriptor &desc, const Link &lhs, const Link &rhs, const Link &dst)
+        : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
     {
     }
 
@@ -63,9 +63,10 @@
     }
 
 private:
-    Link _lhs{};
-    Link _rhs{};
-    Link _dst{};
+    ClElementwiseKernelDescriptor _desc{};
+    Link                          _lhs{};
+    Link                          _rhs{};
+    Link                          _dst{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
new file mode 100644
index 0000000..87cc110
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ComponentType ClFloorKernelComponent::get_component_type() const
+{
+    return ComponentType::Simple;
+}
+
+std::set<std::string> ClFloorKernelComponent::get_headers_list() const
+{
+    return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
+}
+
+Window ClFloorKernelComponent::get_window() const
+{
+    const ITensorInfo *src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+    ITensorInfo       *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src_info, dst_info);
+    auto_init_if_empty(*dst_info, src_info->tensor_shape(), 1, src_info->data_type());
+
+    const unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
+    Window             win                               = calculate_max_window(*dst_info, Steps(num_elems_processed_per_iteration));
+
+    return win;
+}
+
+std::string ClFloorKernelComponent::get_component_code() const
+{
+    return R"_(
+    //------------------ START KERNEL {{meta_kernel_id}} FLOOR ---------------------
+    // IN_0(src)            {{src}}
+    // OUT(dst, accum)      {{dst}}
+    TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+    {
+        TILE({{DATA_TYPE}}, M0, N0, src_tile);
+
+        T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{src}}, cout, mout, 1, {{src}}_stride_y, src_tile);
+        T_FLOOR({{DATA_TYPE}}, M0, N0, src_tile, {{dst}});
+    }
+
+    //------------------ END KERNEL {{meta_kernel_id}} FLOOR ---------------------
+)_";
+}
+
+CLBuildOptions ClFloorKernelComponent::generate_build_options() const
+{
+    CLBuildOptions build_opts{};
+
+    const auto n0 = _blueprint->impl().get_execution_window().x().step();
+    const auto m0 = _blueprint->impl().get_execution_window().y().step();
+
+    const auto         dst_info         = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    const unsigned int partial_store_n0 = dst_info->dimension(0) % n0;
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    return build_opts;
+}
+
+std::string ClFloorKernelComponent::generate_config_id() const
+{
+    auto        t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    std::string config_id{};
+    config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
+    config_id += "_";
+    config_id += support::cpp11::to_string(t_dst_info->dimension(0));
+    config_id += "_";
+    config_id += support::cpp11::to_string(t_dst_info->dimension(1));
+    config_id += "_";
+    config_id += lower_string(string_from_data_layout(t_dst_info->data_layout()));
+    return config_id;
+}
+
+void ClFloorKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+    vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+    vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClFloorKernelComponent::TagLUT ClFloorKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+{
+    TagLUT     lut{};
+    const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+    // Arguments and global shared variables
+    lut["src"]            = vtable.get(_src);
+    lut["dst"]            = vtable.get(_dst);
+    lut["meta_kernel_id"] = id();
+    lut["DATA_TYPE"]      = get_cl_type_from_data_type(t_dst_info->data_type());
+    return lut;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
similarity index 82%
copy from src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
copy to src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
index 5f8b156..5463e23 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
@@ -23,8 +23,8 @@
  */
 #ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
 
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
 
@@ -34,11 +34,11 @@
 {
 namespace dynamic_fusion
 {
-class ClElementwiseAddKernelComponent : public IClKernelComponent
+class ClFloorKernelComponent : public IClKernelComponent
 {
 public:
-    ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
-        : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+    ClFloorKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+        : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
     {
     }
 
@@ -51,7 +51,7 @@
 
     virtual std::vector<Link> get_links() const override
     {
-        return { _lhs, _rhs, _dst };
+        return { _src, _dst };
     }
 
     virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
@@ -59,17 +59,16 @@
 
     virtual std::string name() const override
     {
-        return "eltwise_add_" + std::to_string(id());
+        return "floor_" + std::to_string(id());
     }
 
 private:
-    Link _lhs{};
-    Link _rhs{};
+    Link _src{};
     Link _dst{};
 };
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
 #endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index 26e5052..3f99dd5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -27,7 +27,8 @@
 #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
 
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
 #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
 
 #endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
index a335e5a..bd88afd 100644
--- a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
+++ b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
@@ -198,7 +198,7 @@
     node->set_method(method);
 }
 
-Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
+Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
 {
     auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() });
     check_dependency_graph_op_success(graph, id.first);
@@ -224,7 +224,36 @@
     tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get());
     tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get());
     tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
-    graph.impl()->add_node<AddContent>(id.second, desc, tensors);
+    graph.impl()->add_node<ElementwiseContent>(id.second, desc, tensors);
+    check_multiple_roots(graph);
+
+    return op_node;
+}
+
+Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst)
+{
+    auto id = graph.impl()->graph.add_operator({ src.id() }, { dst.id() });
+    check_dependency_graph_op_success(graph, id.first);
+
+    Operator op_node(id.second);
+
+    // Infer TensorInfo
+    auto             node_src = graph.impl()->tensors[src.id()]->get_tensor_info();
+    OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
+
+    if(node_dst->get_tensor_info()->total_size() == 0)
+    {
+        auto_init_if_empty(*(node_dst->get_tensor_info()), *node_src);
+    }
+
+    // Check execution space
+    auto dst_info = node_dst->get_tensor_info();
+    check_execution_shape(graph, *dst_info);
+
+    ITensorDescPack<OpTensorContent> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[src.id()].get());
+    tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+    graph.impl()->add_node<FloorContent>(id.second, desc, tensors);
     check_multiple_roots(graph);
 
     return op_node;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
index a9ccf90..f10e97e 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
@@ -42,14 +42,24 @@
     Conv2dDescriptor conv2d{};
 };
 
-struct ClEltwiseAddKernelDescriptor
+struct ClElementwiseKernelDescriptor
 {
-    friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1)
+    friend bool operator==(const ClElementwiseKernelDescriptor &desc0, const ClElementwiseKernelDescriptor &desc1)
     {
-        return desc0.add == desc1.add;
+        return desc0.eltwise == desc1.eltwise;
     }
-    AddDescriptor add{};
+    ElementwiseDescriptor eltwise{};
 };
+
+struct ClFloorKernelDescriptor
+{
+    friend bool operator==(const ClFloorKernelDescriptor &desc0, const ClFloorKernelDescriptor &desc1)
+    {
+        return desc0.floor == desc1.floor;
+    }
+    FloorDescriptor floor{};
+};
+
 struct ClActivationKernelDescriptor
 {
     friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &)
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
index de58ce7..cab51a2 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
@@ -124,7 +124,7 @@
     return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
 }
 
-Status ClAddKernel::generate(ClKernelBlueprint &bp) const
+Status ClElementwiseKernel::generate(ClKernelBlueprint &bp) const
 {
     const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -137,11 +137,11 @@
     ArgumentID dst_id;
     add_tensor(bp, dst->desc, dst_id, dst->id);
 
-    add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id);
+    add_kcomp_eltwise_op(bp, desc, lhs_id, rhs_id, dst_id);
     return Status{};
 }
 
-Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
+Status ClElementwiseKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
 {
     // 1. Check validity
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
@@ -186,9 +186,61 @@
     return Status{};
 }
 
-bool ClAddKernel::operator==(const ClKernel &other) const
+bool ClElementwiseKernel::operator==(const ClKernel &other) const
 {
-    const auto converted = *utils::cast::polymorphic_downcast<const ClAddKernel *>(&other);
+    const auto converted = *utils::cast::polymorphic_downcast<const ClElementwiseKernel *>(&other);
+    return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+Status ClFloorKernel::generate(ClKernelBlueprint &bp) const
+{
+    const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ArgumentID src_id;
+    add_tensor(bp, src->desc, src_id, src->id);
+    ArgumentID dst_id;
+    add_tensor(bp, dst->desc, dst_id, dst->id);
+
+    add_kcomp_floor(bp, desc, src_id, dst_id);
+    return Status{};
+}
+
+Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    // 1. Check validity
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+    // Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+    // Matching data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+    // dst shape is correct
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(src->tensor_shape(), dst->tensor_shape(), 0), "Wrong shape for dst");
+
+    // 2. Check support level
+
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
+
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+    return Status{};
+}
+
+bool ClFloorKernel::operator==(const ClKernel &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const ClFloorKernel *>(&other);
     return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
 }
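
ClFloorKernel::validate() above enforces matching data type and layout,
initialised shapes, F32/F16 data only and NHWC only. A sketch of exercising it
(internal header per this patch, available only inside the library tree behind
ENABLE_EXPERIMENTAL_DYNAMIC_FUSION; shapes are illustrative):

#include "arm_compute/core/TensorInfo.h"
#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status try_validate_floor()
{
    const auto src = TensorInfo(TensorShape(32U, 16U), 1, DataType::F32, DataLayout::NHWC);
    const auto dst = TensorInfo(TensorShape(32U, 16U), 1, DataType::F32, DataLayout::NHWC);
    // Passes: F32, NHWC, matching shapes. An S32 src or an NCHW layout would
    // be rejected by the checks above.
    return ClFloorKernel::validate(&src, &dst);
}
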
 
@@ -202,6 +254,7 @@
     }
     return kernels;
 }
+
 std::vector<ClKernel *> traverse(ClKernelGraph &graph)
 {
     std::vector<ClKernel *> kernels;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
index 54e01ea..c3580cf 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
@@ -139,16 +139,16 @@
     ClDirectConv2dKernelDescriptor desc{};
 };
 
-struct ClAddKernel : public ClKernel
+struct ClElementwiseKernel : public ClKernel
 {
 public:
     Complexity complexity() const override
     {
         return Complexity::Simple;
     }
-    ClAddKernel()           = default;
-    ~ClAddKernel() override = default;
-    ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+    ClElementwiseKernel()           = default;
+    ~ClElementwiseKernel() override = default;
+    ClElementwiseKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClElementwiseKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
         : ClKernel{ graph, id, config, tensors }, desc{ desc }
     {
     }
@@ -156,7 +156,27 @@
     bool operator==(const ClKernel &other) const override;
     Status generate(ClKernelBlueprint &bp) const override;
 
-    ClEltwiseAddKernelDescriptor desc{};
+    ClElementwiseKernelDescriptor desc{};
+};
+
+struct ClFloorKernel : public ClKernel
+{
+public:
+    Complexity complexity() const override
+    {
+        return Complexity::Simple;
+    }
+    ClFloorKernel()           = default;
+    ~ClFloorKernel() override = default;
+    ClFloorKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClFloorKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+        : ClKernel{ graph, id, config, tensors }, desc{ desc }
+    {
+    }
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+    bool operator==(const ClKernel &other) const override;
+    Status generate(ClKernelBlueprint &bp) const override;
+
+    ClFloorKernelDescriptor desc{};
 };
 
 struct ClKernelGraph
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
index f971196..274a251 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
@@ -113,9 +113,14 @@
     return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation);
 }
 
-bool operator==(const AddDescriptor &, const AddDescriptor &)
+bool operator==(const ElementwiseDescriptor &ed0, const ElementwiseDescriptor &ed1)
 {
-    return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same
+    return ed0.op == ed1.op; // Compare the arithmetic operations of the two ElementwiseDescriptor objects
+}
+
+bool operator==(const FloorDescriptor &, const FloorDescriptor &)
+{
+    return std::make_tuple() == std::make_tuple(); // Currently two Floor ops are always the same
 }
 
 bool Conv2dContent::operator==(const OperatorContent &other) const
@@ -124,9 +129,15 @@
     return desc == converted.desc;
 }
 
-bool AddContent::operator==(const OperatorContent &other) const
+bool ElementwiseContent::operator==(const OperatorContent &other) const
 {
-    const auto converted = *utils::cast::polymorphic_downcast<const AddContent *>(&other);
+    const auto converted = *utils::cast::polymorphic_downcast<const ElementwiseContent *>(&other);
+    return desc == converted.desc;
+}
+
+bool FloorContent::operator==(const OperatorContent &other) const
+{
+    const auto converted = *utils::cast::polymorphic_downcast<const FloorContent *>(&other);
     return desc == converted.desc;
 }
 
@@ -311,7 +322,7 @@
     return Status{};
 }
 
-Status AddContent::translate(ClKernelGraph &kernel_graph) const
+Status ElementwiseContent::translate(ClKernelGraph &kernel_graph) const
 {
     const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
@@ -338,16 +349,46 @@
     DependencyGraph::Id add_id;
     ClKernelConfig      config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
 
-    st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc);
+    st = ClElementwiseKernel::validate(lhs->desc, rhs->desc, dst->desc);
     ARM_COMPUTE_RETURN_ON_ERROR(st);
 
-    st = kernel_graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id);
+    st = kernel_graph.add_kernel<ClElementwiseKernel>(config, ClElementwiseKernelDescriptor{ desc }, tensors, add_id);
     ARM_COMPUTE_RETURN_ON_ERROR(st);
     ARM_COMPUTE_UNUSED(add_id);
 
     return Status{};
 }
 
+Status FloorContent::translate(ClKernelGraph &kernel_graph) const
+{
+    const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    ITensorDescPack<ClKernelTensor> tensors;
+
+    DependencyGraph::Id src_id;
+    auto                st = add_kernel_tensor(kernel_graph, *_graph, *src, src_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(src_id));
+
+    DependencyGraph::Id dst_id;
+    st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+    tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+    DependencyGraph::Id add_id;
+    ClKernelConfig      config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+
+    st = ClFloorKernel::validate(src->desc, dst->desc);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    st = kernel_graph.add_kernel<ClFloorKernel>(config, ClFloorKernelDescriptor{ desc }, tensors, add_id);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+    return Status{};
+}
+
 std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph)
 {
     std::vector<const OperatorContent *> ops;
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
index 2786d61..b303cdb 100644
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
@@ -157,19 +157,19 @@
     Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const;
 };
 
-class AddContent : public OperatorContent
+class ElementwiseContent : public OperatorContent
 {
 public:
-    AddContent() = default;
-    AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+    ElementwiseContent() = default;
+    ElementwiseContent(const OperatorGraph::Implementation *graph, Id id, const ElementwiseDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
         : OperatorContent(graph, id, tensors), desc(desc)
     {
     }
-    ~AddContent()                  = default;
-    AddContent(const AddContent &) = default;
-    AddContent &operator=(const AddContent &) = default;
-    AddContent(AddContent &&)                 = default;
-    AddContent &operator=(AddContent &&) = default;
+    ~ElementwiseContent()                          = default;
+    ElementwiseContent(const ElementwiseContent &) = default;
+    ElementwiseContent &operator=(const ElementwiseContent &) = default;
+    ElementwiseContent(ElementwiseContent &&)                 = default;
+    ElementwiseContent &operator=(ElementwiseContent &&) = default;
     bool operator==(const OperatorContent &other) const override;
     OperatorComplexity complexity() const override
     {
@@ -178,7 +178,31 @@
     Status translate(ClKernelGraph &kernel_graph) const override;
 
 private:
-    AddDescriptor desc{};
+    ElementwiseDescriptor desc{};
+};
+
+class FloorContent : public OperatorContent
+{
+public:
+    FloorContent() = default;
+    FloorContent(const OperatorGraph::Implementation *graph, Id id, const FloorDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+        : OperatorContent(graph, id, tensors), desc(desc)
+    {
+    }
+    ~FloorContent()                    = default;
+    FloorContent(const FloorContent &) = default;
+    FloorContent &operator=(const FloorContent &) = default;
+    FloorContent(FloorContent &&)                 = default;
+    FloorContent &operator=(FloorContent &&) = default;
+    bool operator==(const OperatorContent &other) const override;
+    OperatorComplexity complexity() const override
+    {
+        return OperatorComplexity::Simple;
+    }
+    Status translate(ClKernelGraph &kernel_graph) const override;
+
+private:
+    FloorDescriptor desc{};
 };
 
 struct OperatorGraph::Implementation
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index 96a845c..3ffbc07 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -74,8 +74,9 @@
     ClExecutionDescriptor exec_desc{};
     Status                st{};
 
-    const auto data_type = DataType::F32;
-    const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
+    const auto data_type    = DataType::F32;
+    const auto conv_info    = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
+    const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::ADD };
 
     const auto width     = 7U;
     const auto height    = 6U;
@@ -99,7 +100,7 @@
     const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
 
     const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info };
-    const ClEltwiseAddKernelDescriptor   eltwise_add_desc{};
+    const ClElementwiseKernelDescriptor  eltwise_add_desc{ eltwise_info };
     const TileDescriptor                 store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
 
     ArgumentID src_id{ g_arg_placeholder };
@@ -119,7 +120,7 @@
     st = add_tensor(bp, &dst_info, dst_id);
 
     st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
-    st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
+    st = add_kcomp_eltwise_op(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
     st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id);
 
     exec_desc.skip_sliding_window = true;
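
Note: the kernel-component API likewise selects the operation through the descriptor; a
hypothetical Div variant of the fused blueprint above would only swap the descriptor
(all identifiers reused from the test):

    const auto eltwise_div_info = ElementwiseDescriptor{ ArithmeticOperation::DIV };
    const ClElementwiseKernelDescriptor eltwise_div_desc{ eltwise_div_info };
    st = add_kcomp_eltwise_op(bp, eltwise_div_desc, addend_id, acc_id, acc_1_id);
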
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
new file mode 100644
index 0000000..2b8f69e
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#include "arm_compute/core/TensorInfo.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/validation/reference/Floor.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Operator_Floor_1_F32, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = floor(input)
+     */
+    const auto data_type    = DataType::F32;
+    const auto data_layout  = DataLayout::NHWC;
+    const auto t_shape      = TensorShape(32, 16);
+    auto       t_input_info = TensorInfo(t_shape, 1, data_type, data_layout);
+    auto       t_dst_info   = TensorInfo();
+
+    FloorDescriptor floor_desc{};
+
+    // Create reference
+    SimpleTensor<float> ref_t_input{ t_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+    // Fill reference
+    fill<float>(ref_t_input, 0, library.get());
+
+    // Permute the NHWC input into NCHW and compute the reference in that layout;
+    // validate() below maps the NHWC output coordinates onto the NCHW reference
+    auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+    auto ref_t_dst_nchw   = reference::floor_layer(ref_t_input_nchw);
+
+    CLScheduler::get().default_reinit();
+    const auto    cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    OperatorGraph op_graph;
+
+    const auto op_t_input = add_tensor(op_graph, t_input_info);
+    const auto op_t_dst   = add_tensor(op_graph, t_dst_info);
+
+    add_op_floor(op_graph, floor_desc, op_t_input, op_t_dst);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload              workload;
+    build(workload, op_graph, workload_ctx);
+
+    ClCompositeOperator op;
+    op.configure(cl_compile_ctx, workload);
+
+    // Construct tensors
+    CLTensor t_input{};
+    CLTensor t_dst{};
+
+    // Init tensors
+    t_input.allocator()->init(t_input_info);
+    t_dst.allocator()->init(t_dst_info);
+
+    // Allocate and fill tensors
+    t_input.allocator()->allocate();
+    t_dst.allocator()->allocate();
+    fill<float>(CLAccessor(t_input), 0, library.get());
+    // "Pack" tensors
+    OpTensorBinding bp_tensors({ { op_t_input, &t_input },
+                                 { op_t_dst, &t_dst } });
+
+    // Populate prepare and run pack-maps (including allocating aux tensors)
+    ClAuxTensorData aux_tensor_data{};
+    TensorPackMap   prepare_pack_map{};
+    TensorPackMap   run_pack_map{};
+    bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
+
+    op.prepare(prepare_pack_map);
+    op.run(run_pack_map);
+    // Tolerance for comparing the implementation's floating-point output against the reference
+    RelativeTolerance<float> tolerance_f32(0.001f);
+    validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
\ No newline at end of file
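
Note: this Floor case doubles as a template for the binary operators; a Div test would
keep the same build/bind/run skeleton and change only the graph construction (a sketch;
op_t_rhs and its t_rhs_info are hypothetical second-operand names):

    ElementwiseDescriptor div_desc{ ArithmeticOperation::DIV };
    const auto op_t_rhs = add_tensor(op_graph, t_rhs_info); // second input tensor
    add_op_elementwise_op(op_graph, div_desc, op_t_input, op_t_rhs, op_t_dst);
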
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
index fe8d23e..3a8b7c8 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
@@ -77,8 +77,8 @@
     auto       t_acc_info       = TensorInfo(); // Intermediate tensor for cond3
     auto       t_dst_info       = TensorInfo();
 
-    Conv2dDescriptor conv2d_desc{};
-    AddDescriptor    add_desc{};
+    Conv2dDescriptor      conv2d_desc{};
+    ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
 
     // Create reference
     SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
@@ -119,7 +119,7 @@
 
     auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
     force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
-    add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -180,8 +180,8 @@
     auto       t_acc_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
     auto       t_dst_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
 
-    Conv2dDescriptor conv2d_desc{};
-    AddDescriptor    add_desc{};
+    Conv2dDescriptor      conv2d_desc{};
+    ElementwiseDescriptor add_desc{};
 
     OperatorGraph op_graph;
 
@@ -192,7 +192,7 @@
     const auto op_t_dst       = add_tensor(op_graph, t_dst_info);
 
     auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
-    add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
     force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
@@ -290,7 +290,7 @@
     auto t_dst_info    = TensorInfo();
 
     OperatorGraph op_graph;
-    const auto    add_desc = AddDescriptor{};
+    const auto    add_desc = ElementwiseDescriptor{};
 
     const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
     const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info);
@@ -300,9 +300,9 @@
     const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred
     const auto op_t_dst    = add_tensor(op_graph, t_dst_info);
 
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -334,7 +334,7 @@
 
     OperatorGraph op_graph;
     const auto    conv2d_desc = Conv2dDescriptor{};
-    const auto    add_desc    = AddDescriptor{};
+    const auto    add_desc    = ElementwiseDescriptor{};
 
     const auto op_t_l0_0_input  = add_tensor(op_graph, t_l0_0_input_info);
     const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info);
@@ -345,8 +345,8 @@
     const auto op_t_dst         = add_tensor(op_graph, t_dst_info);
 
     add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;
@@ -374,7 +374,7 @@
 
     OperatorGraph op_graph;
     const auto    conv2d_desc = Conv2dDescriptor{};
-    const auto    add_desc    = AddDescriptor{};
+    const auto    add_desc    = ElementwiseDescriptor{};
 
     const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
     const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info);
@@ -382,7 +382,7 @@
     const auto op_t_state1 = add_tensor(op_graph, state1_info);
 
     add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1);
-    add_op_elementwise_add(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
+    add_op_elementwise_op(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
 
     const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
     ClWorkload              workload;