Add GpuAdd to dynamic fusion operators

- Provide support for the Add operator
- Auto-initialize the destination tensor before testing fusion in the
  conv2d and elementwise binary ops
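
The operator follows the existing sketch-based dynamic fusion flow. As a
minimal usage sketch (condensed from the new ElementwiseBinaryFixture.h
added below; tensor allocation and runtime.run() are omitted, and `shape`
stands for any supported TensorShape):

    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    auto lhs_info = sketch.create_tensor_info(shape, 1, DataType::F32);
    auto rhs_info = sketch.create_tensor_info(shape, 1, DataType::F32);
    auto dst_info = sketch.create_tensor_info();

    GpuAdd::create_op(sketch, &lhs_info, &rhs_info, &dst_info);

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);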

Resolves: COMPMID-5518
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: Ibd815020f02b57f88eea7c2921bdcf98605d99c5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8617
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/datasets/DynamicFusionDataset.h b/tests/datasets/DynamicFusionDataset.h
new file mode 100644
index 0000000..5a1453b
--- /dev/null
+++ b/tests/datasets/DynamicFusionDataset.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_DATASETS_DYNAMICFUSIONDATASET
+#define TESTS_DATASETS_DYNAMICFUSIONDATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
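+/** Data set of tensor-shape triplets, used to validate fusing two elementwise binary operators in one sketch. */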
+class DynamicFusionThreeInputs
+{
+public:
+    using type = std::tuple<TensorShape, TensorShape, TensorShape>;
+
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator shape0_it,
+                 std::vector<TensorShape>::const_iterator shape1_it,
+                 std::vector<TensorShape>::const_iterator shape2_it)
+            : _shape0_it{ std::move(shape0_it) },
+              _shape1_it{ std::move(shape1_it) },
+              _shape2_it{ std::move(shape2_it) }
+        {
+        }
+
+        std::string description() const
+        {
+            std::stringstream description;
+            description << "shape0=" << *_shape0_it << ":";
+            description << "shape1=" << *_shape1_it << ":";
+            description << "shape2=" << *_shape2_it << ":";
+
+            return description.str();
+        }
+
+        DynamicFusionThreeInputs::type operator*() const
+        {
+            return std::make_tuple(*_shape0_it, *_shape1_it, *_shape2_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_shape0_it;
+            ++_shape1_it;
+            ++_shape2_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _shape0_it;
+        std::vector<TensorShape>::const_iterator _shape1_it;
+        std::vector<TensorShape>::const_iterator _shape2_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_shape0_shapes.begin(), _shape1_shapes.begin(), _shape2_shapes.begin());
+    }
+
+    int size() const
+    {
+        return std::min(_shape0_shapes.size(), std::min(_shape1_shapes.size(), _shape2_shapes.size()));
+    }
+
+    void add_config(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+    {
+        _shape0_shapes.emplace_back(std::move(shape0));
+        _shape1_shapes.emplace_back(std::move(shape1));
+        _shape2_shapes.emplace_back(std::move(shape2));
+    }
+
+protected:
+    DynamicFusionThreeInputs()                            = default;
+    DynamicFusionThreeInputs(DynamicFusionThreeInputs &&) = default;
+
+private:
+    std::vector<TensorShape> _shape0_shapes{};
+    std::vector<TensorShape> _shape1_shapes{};
+    std::vector<TensorShape> _shape2_shapes{};
+};
+
+class DynamicFusionElementwiseBinaryTwoOpsSmallShapes final : public DynamicFusionThreeInputs
+{
+public:
+    DynamicFusionElementwiseBinaryTwoOpsSmallShapes()
+    {
+        add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U });
+        add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 1U, 1U, 1U } /* Broadcast in X, Y, Z */, TensorShape{ 9U, 9U, 5U });
+        add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* Broadcast in Y and Z */, TensorShape{ 27U, 13U, 2U });
+        add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* Broadcast in Y and Z */);
+    }
+};
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* TESTS_DATASETS_DYNAMICFUSIONDATASET */
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index e4277a9..047457c 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -212,6 +212,25 @@
     }
 };
 
+/** Data set containing small tensor shapes with batch size 1. */
+class SmallShapesNoBatches final : public ShapeDataset
+{
+public:
+    SmallShapesNoBatches()
+        : ShapeDataset("Shape",
+    {
+        // Batch size 1
+        TensorShape{ 3U, 11U },
+                     TensorShape{ 1U, 16U },
+                     TensorShape{ 27U, 13U, 7U },
+                     TensorShape{ 7U, 7U, 17U },
+                     TensorShape{ 33U, 13U, 2U },
+                     TensorShape{ 11U, 11U, 3U }
+    })
+    {
+    }
+};
+
 /** Data set containing pairs of tiny tensor shapes that are broadcast compatible. */
 class TinyShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
 {
@@ -282,6 +301,44 @@
     }
 };
 
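+/** Data set containing a temporarily limited selection of small broadcast-compatible tensor shape pairs. */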
+class TemporaryLimitedSmallShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
+{
+public:
+    TemporaryLimitedSmallShapesBroadcast()
+        : ZipDataset<ShapeDataset, ShapeDataset>(
+              ShapeDataset("Shape0",
+    {
+        TensorShape{ 9U, 9U, 5U },
+                     TensorShape{ 27U, 13U, 2U },
+    }),
+    ShapeDataset("Shape1",
+    {
+        TensorShape{ 1U, 1U, 1U },  // Broadcast in X, Y, Z
+        TensorShape{ 27U, 1U, 1U }, // Broadcast in Y and Z
+    }))
+    {
+    }
+};
+
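+/** Data set containing a temporarily limited selection of large broadcast-compatible tensor shape pairs. */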
+class TemporaryLimitedLargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
+{
+public:
+    TemporaryLimitedLargeShapesBroadcast()
+        : ZipDataset<ShapeDataset, ShapeDataset>(
+              ShapeDataset("Shape0",
+    {
+        TensorShape{ 127U, 25U, 5U },
+                     TensorShape{ 485U, 40U, 10U }
+    }),
+    ShapeDataset("Shape1",
+    {
+        TensorShape{ 1U, 1U, 1U },   // Broadcast in X, Y, Z
+        TensorShape{ 485U, 1U, 1U }, // Broadcast in Y, Z
+    }))
+    {
+    }
+};
+
 /** Data set containing medium tensor shapes. */
 class MediumShapes final : public ShapeDataset
 {
@@ -359,6 +416,19 @@
     }
 };
 
+/** Data set containing large tensor shapes with batch size 1. */
+class LargeShapesNoBatches final : public ShapeDataset
+{
+public:
+    LargeShapesNoBatches()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 582U, 131U, 2U },
+    })
+    {
+    }
+};
+
 /** Data set containing pairs of large tensor shapes that are broadcast compatible. */
 class LargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset>
 {
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 036f28b..0b81dac 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -28,24 +28,14 @@
 #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/gpu/cl/operators/ClAdd.h"
-#include "src/gpu/cl/operators/ClConv2d.h"
 
 #include "tests/CL/CLAccessor.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/dynamic_fusion/Utils.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/Permute.h"
 
-#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-#include "tests/SimpleTensorPrinter.h"
-#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-
 using namespace arm_compute::experimental::dynamic_fusion;
 using namespace arm_compute::test::validation::utils;
 
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
new file mode 100644
index 0000000..3743fbb
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(ADD)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // S16 is a valid data type for Add
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // S32 is a valid data type for Add
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting not allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32), // Batching not supported
+                                                      }),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for rhs
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32), // Batching not supported
+                                                      })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("Expected", { true, false, true, true, false, false, true, false})),
+               input1_info, input2_info, output_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    // Fuse Elementwise Add
+    auto lhs_info = sketch.create_tensor_info(input1_info);
+    auto rhs_info = sketch.create_tensor_info(input2_info);
+    auto dst_info = sketch.create_tensor_info(output_info);
+    bool res      = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &dst_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(ValidateRhsInplace, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                      }),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32), // Broadcasting not allowed for rhs
+                                                      })),
+               framework::dataset::make("Expected", { true, false})),
+               input1_info, input2_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    // Fuse Elementwise Add
+    auto lhs_info = sketch.create_tensor_info(input1_info);
+    auto rhs_info = sketch.create_tensor_info(input2_info);
+    bool res      = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &rhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+DATA_TEST_CASE(ValidateLhsInplace, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32), // Broadcasting not allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                      }),
+               framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs
+                                                      })),
+               framework::dataset::make("Expected", { false, true})),
+               input1_info, input2_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    // Fuse Elementwise Add
+    auto lhs_info = sketch.create_tensor_info(input1_info);
+    auto rhs_info = sketch.create_tensor_info(input2_info);
+    bool res      = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &lhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+RelativeTolerance<float>            tolerance_f32(0.01f);                 /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr float                     tolerance_num = 0.01f;                /**< Allowed ratio of mismatching values when validating FP16 results */
+
+template <typename T>
+using DynamicFusionAddOpFixture = DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T>
+using DynamicFusionAddOpBroadcastFixture = DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+template <typename T>
+using DynamicFusionGpuFuseTwoAddOpsFixture = DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(
+                                                                                                                       framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                       datasets::SmallShapesNoBatches()),
+                                                                                                                   framework::dataset::make("DataType", { DataType::F32 })),
+                                                                                                                   framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(
+                                                                                                                     framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                     datasets::LargeShapesNoBatches()),
+                                                                                                                 framework::dataset::make("DataType", { DataType::F32 })),
+                                                                                                                 framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                       datasets::TemporaryLimitedSmallShapesBroadcast()),
+                       framework::dataset::make("DataType", { DataType::F32 })),
+                       framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                       datasets::TemporaryLimitedLargeShapesBroadcast()),
+                       framework::dataset::make("DataType", { DataType::F32 })),
+                       framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps, DynamicFusionGpuFuseTwoAddOpsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                       datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+                       framework::dataset::make("DataType", { DataType::F32 })),
+                       framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                    datasets::SmallShapesNoBatches()),
+                                                                                                                    framework::dataset::make("DataType", { DataType::F16 })),
+                                                                                                            framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                       datasets::TemporaryLimitedSmallShapesBroadcast()),
+                       framework::dataset::make("DataType", { DataType::F16 })),
+                       framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                        datasets::SmallShapesNoBatches()),
+                                                                                                                        framework::dataset::make("DataType", { DataType::S32 })),
+                                                                                                                framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                        datasets::SmallShapesNoBatches()),
+                                                                                                                        framework::dataset::make("DataType", { DataType::S16 })),
+                                                                                                                framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                        datasets::LargeShapesNoBatches()),
+                                                                                                                        framework::dataset::make("DataType", { DataType::S16 })),
+                                                                                                                framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }),
+                                                                                                                        datasets::SmallShapesNoBatches()),
+                                                                                                                        framework::dataset::make("DataType", { DataType::U8 })),
+                                                                                                                framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // ADD
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index 1f9319b..bfb9735 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -22,21 +22,8 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
-#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
-
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
-#include "tests/Globals.h"
-#include "tests/IAccessor.h"
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -46,12 +33,6 @@
 #include "tests/datasets/SmallConvolutionLayerDataset.h"
 #include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h"
 
-#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-#include "tests/SimpleTensorPrinter.h"
-#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
 namespace arm_compute
 {
 namespace test
@@ -60,7 +41,7 @@
 {
 TEST_SUITE(CL)
 TEST_SUITE(DYNAMIC_FUSION)
-TEST_SUITE(GPU_CONV2D)
+TEST_SUITE(CONV2D)
 
 RelativeTolerance<float>            tolerance_f32(0.01f);                 /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
 RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
@@ -79,7 +60,6 @@
 }
 TEST_SUITE_END() // FP32
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
@@ -90,9 +70,8 @@
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END() // FP16
-#endif           //  __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-TEST_SUITE_END() // GPU_CONV2D
+TEST_SUITE_END() // CONV2D
 TEST_SUITE_END() // DYNAMIC_FUSION
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h
index b052248..e437c44 100644
--- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h
@@ -21,32 +21,23 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE
-#define ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE
+#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE
+#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
 #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
 #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
 
-#include "src/gpu/cl/operators/ClAdd.h"
-#include "src/gpu/cl/operators/ClConv2d.h"
-
 #include "tests/CL/CLAccessor.h"
-
-#include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
 #include "tests/framework/Macros.h"
-
 #include "tests/validation/Validation.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
 #include "tests/validation/reference/Permute.h"
 
 using namespace arm_compute::experimental::dynamic_fusion;
@@ -136,10 +127,10 @@
             tensor->allocator()->allocate(); // Use ACL allocated memory
         }
         // Construct user tensors
-        CLTensor t_input{};
-        CLTensor t_weight{};
-        CLTensor t_bias{};
-        CLTensor t_dst{};
+        TensorType t_input{};
+        TensorType t_weight{};
+        TensorType t_bias{};
+        TensorType t_dst{};
 
         // Initialize user tensors
         t_input.allocator()->init(input_info);
@@ -152,9 +143,10 @@
         t_weight.allocator()->allocate();
         t_bias.allocator()->allocate();
         t_dst.allocator()->allocate();
-        fill(CLAccessor(t_input), 0);
-        fill(CLAccessor(t_weight), 1);
-        fill(CLAccessor(t_bias), 2);
+
+        fill(AccessorType(t_input), 0);
+        fill(AccessorType(t_weight), 1);
+        fill(AccessorType(t_bias), 2);
 
         // Run runtime
         runtime.run({ &t_input, &t_weight, &t_bias, &t_dst });
@@ -187,15 +179,11 @@
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
     DataType         _data_type{};
-    DataType         _weights_data_type{};
     DataType         _bias_data_type{};
-    DataType         _output_data_type{};
     DataLayout       _data_layout{};
     QuantizationInfo _quantization_info{};
     QuantizationInfo _weight_quantization_info{};
     bool             _is_quantized = false;
-    bool             _is_bfloat16  = false;
-    bool             _mixed_layout = false;
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -207,10 +195,10 @@
                const PadStrideInfo &info, const Size2D &dialation, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info)
     {
         DynamicFusionGpuConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, output_shape, bias_shape, info, dialation,
-                                                                                                        data_type, data_layout, quantization_info, quantization_info);
+                                                                                                         data_type, data_layout, quantization_info, quantization_info);
     }
 };
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE */
+#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE */
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
new file mode 100644
index 0000000..d112377
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE
+#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
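+/** Generic fixture validating one elementwise binary operator, optionally fused with a second one, against the reference implementation. */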
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryValidationGenericFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace)
+    {
+        _op         = op;
+        _is_inplace = is_inplace;
+        _data_type  = data_type;
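+        // An empty shape2 (total_size() == 0) selects the single-op path; a non-empty shape2 fuses a second op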
+        _fuse       = shape2.total_size() != 0;
+        ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In-place computation is not yet supported for the fused case.");
+        _target    = compute_target(shape0, shape1, shape2);
+        _reference = compute_reference(shape0, shape1, shape2);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        if(is_data_type_float(tensor.data_type()))
+        {
+            switch(_op)
+            {
+                case ArithmeticOperation::DIV:
+                    library->fill_tensor_uniform_ranged(tensor, i, { std::pair<float, float>(-0.001f, 0.001f) });
+                    break;
+                case ArithmeticOperation::POWER:
+                    library->fill_tensor_uniform(tensor, i, 0.0f, 5.0f);
+                    break;
+                default:
+                    library->fill_tensor_uniform(tensor, i);
+            }
+        }
+        else if(tensor.data_type() == DataType::S32)
+        {
+            switch(_op)
+            {
+                case ArithmeticOperation::DIV:
+                    library->fill_tensor_uniform_ranged(tensor, i, { std::pair<int32_t, int32_t>(-1, 1) });
+                    break;
+                default:
+                    library->fill_tensor_uniform(tensor, i);
+            }
+        }
+        else
+        {
+            library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    TensorType compute_target(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+    {
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+        GpuWorkloadSketch sketch{ &gpu_ctx };
+        TensorInfo        dst_info{};
+        TensorInfo        dst_info_fuse{};
+
+        // Create the first elementwise binary op
+        auto       lhs_info = sketch.create_tensor_info(shape0, 1, _data_type);
+        auto       rhs_info = sketch.create_tensor_info(TensorInfo(shape1, 1, _data_type));
+        TensorInfo rhs_info_fuse;
+
+        // When not in-place, write to a separate destination tensor; otherwise overwrite lhs
+        if(!_is_inplace)
+        {
+            dst_info = sketch.create_tensor_info(TensorInfo(1, _data_type));
+
+            FunctionType::create_op(sketch, &lhs_info, &rhs_info, &dst_info);
+        }
+        else
+        {
+            FunctionType::create_op(sketch, &lhs_info, &rhs_info, &lhs_info);
+        }
+
+        if(_fuse)
+        {
+            // Fuse the second elementwise binary op
+            rhs_info_fuse = sketch.create_tensor_info(TensorInfo(shape2, 1, _data_type));
+            dst_info_fuse = sketch.create_tensor_info();
+            FunctionType::create_op(sketch, &dst_info, &rhs_info_fuse, &dst_info_fuse);
+        }
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for(auto &data : runtime.get_auxiliary_tensors())
+        {
+            TensorType   *tensor      = data.first;
+            AuxMemoryInfo aux_mem_req = data.second;
+            tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+            tensor->allocator()->allocate();
+        }
+
+        // Construct user tensors
+        TensorType t_lhs{};
+        TensorType t_rhs{};
+        TensorType t_rhs_fuse{};
+        TensorType t_dst{};
+        TensorType t_dst_fuse{};
+
+        // Initialize user tensors
+        t_lhs.allocator()->init(lhs_info);
+        t_rhs.allocator()->init(rhs_info);
+        if(!_is_inplace)
+        {
+            t_dst.allocator()->init(dst_info);
+            if(_fuse)
+            {
+                t_rhs_fuse.allocator()->init(rhs_info_fuse);
+                t_dst_fuse.allocator()->init(dst_info_fuse);
+            }
+        }
+
+        // Allocate and fill user tensors
+        // Instead of using ACL allocator, the user can choose to import memory into the tensors
+        t_lhs.allocator()->allocate();
+        t_rhs.allocator()->allocate();
+        if(!_is_inplace)
+        {
+            t_dst.allocator()->allocate();
+            if(_fuse)
+            {
+                t_rhs_fuse.allocator()->allocate();
+                t_dst_fuse.allocator()->allocate();
+            }
+        }
+
+        fill(AccessorType(t_lhs), 0);
+        fill(AccessorType(t_rhs), 1);
+        if(_fuse)
+        {
+            fill(AccessorType(t_rhs_fuse), 2);
+        }
+        // Run runtime
+        if(_is_inplace)
+        {
+            runtime.run({ &t_lhs, &t_rhs, &t_lhs });
+        }
+        else
+        {
+            if(_fuse)
+            {
+                runtime.run({ &t_lhs, &t_rhs, &t_rhs_fuse, &t_dst_fuse });
+            }
+            else
+            {
+                runtime.run({ &t_lhs, &t_rhs, &t_dst });
+            }
+        }
+
+        if(_is_inplace)
+        {
+            return t_lhs;
+        }
+        else if(_fuse)
+        {
+            return t_dst_fuse;
+        }
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(TensorShape shape0, TensorShape shape1, TensorShape shape2)
+    {
+        const TensorShape out_shape      = TensorShape::broadcast_shape(shape0, shape1);
+        const TensorShape out_shape_fuse = TensorShape::broadcast_shape(out_shape, shape2);
+
+        // Create reference
+        SimpleTensor<T> ref_lhs{ shape0, _data_type, 1, QuantizationInfo() };
+        SimpleTensor<T> ref_rhs{ shape1, _data_type, 1, QuantizationInfo() };
+        SimpleTensor<T> ref_rhs_fuse{ shape2, _data_type, 1, QuantizationInfo() };
+        SimpleTensor<T> ref_dst{ out_shape, _data_type, 1, QuantizationInfo() };
+        SimpleTensor<T> ref_dst_fuse{ out_shape_fuse, _data_type, 1, QuantizationInfo() };
+        // Fill reference
+        fill(ref_lhs, 0);
+        fill(ref_rhs, 1);
+
+        reference::arithmetic_operation<T>(_op, ref_lhs, ref_rhs, ref_dst, ConvertPolicy::WRAP);
+        if(_fuse)
+        {
+            fill(ref_rhs_fuse, 2);
+            reference::arithmetic_operation<T>(_op, ref_dst, ref_rhs_fuse, ref_dst_fuse, ConvertPolicy::WRAP);
+        }
+        // Return the fused result when a second op was fused, otherwise the single-op result
+        return _fuse ? ref_dst_fuse : ref_dst;
+    }
+
+    ArithmeticOperation _op{ ArithmeticOperation::ADD };
+    TensorType          _target{};
+    SimpleTensor<T>     _reference{};
+    DataType            _data_type{};
+    DataLayout          _data_layout{};
+    bool                _is_inplace{ false };
+    bool                _fuse{ false };
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryOneOpValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(ArithmeticOperation op, TensorShape shape, const DataType data_type, const bool is_inplace)
+    {
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape, shape, TensorShape(), data_type, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, const DataType data_type, const bool is_inplace)
+    {
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, TensorShape(), data_type, is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace)
+    {
+        DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, shape2, data_type, is_inplace);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE */