Add Subtraction operator to Dynamic Fusion interface

Partially-Resolves: COMPMID-5518
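
For context, a minimal usage sketch of the new operator (illustrative only: it mirrors
the pattern used in tests/validation/dynamic_fusion/gpu/cl/Sub.cpp added below; the
shapes, data type and the fuse_sub_example() wrapper are placeholders, not part of
this patch):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void fuse_sub_example() // hypothetical wrapper, for illustration only
    {
        // Create a workload context and sketch backed by the CL compile context.
        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
        auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
        GpuWorkloadSketch sketch{ &gpu_ctx };

        // Register the operand tensor infos with the sketch (shapes/type are placeholders).
        auto lhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));
        auto rhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32));

        // Validate the configuration first; on success, fuse the subtraction into the sketch.
        if(bool(GpuSub::validate_op(sketch, &lhs_info, &rhs_info)))
        {
            // dst_info describes the operator's output and can be consumed by further fused operators.
            ITensorInfo *dst_info = GpuSub::create_op(sketch, &lhs_info, &rhs_info);
            ARM_COMPUTE_UNUSED(dst_info);
        }
    }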

Change-Id: I8358784815bcac461d50e384fa7bc96f476d3983
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9045
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Dynamic-Fusion: SiCong Li <sicong.li@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 78aa64a..23b19c2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -625,6 +625,7 @@
         "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
index 796fd6f..33eded4 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
@@ -79,7 +79,7 @@
     static Status is_supported_op(const GpuWorkloadContext &context,
                                   const ITensorInfo        *lhs,
                                   const ITensorInfo        *rhs);
-    /**  Validate the operator and check if the its configuration is supported and if it can be fused into the workload sketch.
+    /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
      *
      * Parameters are similar to @ref GpuAdd::create_op()
      *
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
index 1ba05ae..83b004b 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST
 #define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST
 
-#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 
 namespace arm_compute
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
index 69c7a3a..0f50127 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
@@ -24,10 +24,13 @@
 #ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESHAPE
 #define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESHAPE
 
-#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h"
+
 namespace arm_compute
 {
+/** Forward declaration */
+class ITensorInfo;
+
 namespace experimental
 {
 namespace dynamic_fusion
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
index f9661c1..2579d10 100644
--- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
@@ -25,7 +25,6 @@
 #ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE
 #define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE
 
-#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
 
 namespace arm_compute
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
new file mode 100644
index 0000000..6f8c2d0
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuSub final
+{
+public:
+    /** Create an operator and fuse it into the workload sketch.
+     *    @note If @ref validate_op() fails, the creation also fails and may throw an error.
+     *    @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+     *
+     * Valid data type configurations:
+     * |lhs            |rhs            |dst           |
+     * |:--------------|:--------------|:-------------|
+     * |F16            |F16            |F16           |
+     * |F32            |F32            |F32           |
+     * |S32            |S32            |S32           |
+     * |S16            |S16            |S16           |
+     * |U8             |U8             |U8            |
+     *
+     * Valid data layouts:
+     * - Any
+     *
+     * @param[in,out] sketch Workload sketch into which the operator will be fused
+     * @param[in]     lhs    Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+     * @param[in]     rhs    Right hand side tensor info. Same as @p lhs.
+     *
+     * @return Pointer for the destination tensor info
+     */
+    static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
+                                  ITensorInfo       *lhs,
+                                  ITensorInfo       *rhs);
+
+    /** Check if the operator configuration is supported, irrespective of fusion
+     *
+     * @param[in] context Workload context within which the operator is running
+     * @param[in] lhs     Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+     * @param[in] rhs     Right hand side tensor info. Same as @p lhs.
+     *
+     * @return Status
+     */
+    static Status is_supported_op(const GpuWorkloadContext &context,
+                                  const ITensorInfo        *lhs,
+                                  const ITensorInfo        *rhs);
+
+    /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+     *
+     * Parameters are similar to @ref GpuSub::create_op()
+     *
+     * @return Status
+     */
+    static Status validate_op(const GpuWorkloadSketch &sketch,
+                              const ITensorInfo       *lhs,
+                              const ITensorInfo       *rhs);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB */
diff --git a/filelist.json b/filelist.json
index a66d2a3..aec4fa8 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2241,6 +2241,7 @@
       "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp",
+      "src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index baee0d5..1e4dddd 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -21,11 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
+#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
 
 // *INDENT-OFF*
 // clang-format off
-#ifndef ARM_COMPUTE_TILE_HELPERS_H
-#define ARM_COMPUTE_TILE_HELPERS_H
 
 #define TILE_VECTOR_SIZE1 1
 #define TILE_VECTOR_SIZE2 2
@@ -1062,6 +1062,7 @@
 #define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
 
 #define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
+#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
 #define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
 #define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
 
@@ -1129,6 +1130,9 @@
 #define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 #define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 
+#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+
 #define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 
 #define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
@@ -1196,6 +1200,7 @@
     })
 
 #define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
+#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 #define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 #define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
 
@@ -1288,4 +1293,4 @@
         })                                                                                             \
     })
 
-#endif // ARM_COMPUTE_TILE_HELPERS_H
+#endif /* SRC_CORE_CL_CL_KERNELS_TILE_HELPERS */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 2611d6d..b21c7c3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -38,6 +38,7 @@
 std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops
 {
     ElementwiseBinaryCommonAttributes::ElementwiseOp::Add,
+    ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
     ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul
 };
 }
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
new file mode 100644
index 0000000..8240008
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuSub::validate_op(const GpuWorkloadSketch &sketch,
+                           const ITensorInfo       *lhs,
+                           const ITensorInfo       *rhs)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+    // Set the elementwise operation to Sub then call the elementwise common validate_op
+    ElementwiseBinaryCommonAttributes common_attributes{};
+    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+    return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuSub::is_supported_op(const GpuWorkloadContext &context,
+                               const ITensorInfo        *lhs,
+                               const ITensorInfo        *rhs)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+    // Set the elementwise operation to Sub then call the elementwise common is_supported_op
+    ElementwiseBinaryCommonAttributes common_attributes{};
+    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+    return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch,
+                               ITensorInfo       *lhs,
+                               ITensorInfo       *rhs)
+{
+    // No need to log or validate, as both are handled inside GpuElementwiseBinaryCommon::create_op()
+    // Set the elementwise operation to Sub then call the elementwise common create_op
+    ElementwiseBinaryCommonAttributes common_attributes{};
+    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+    return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 0dd7ca5..52164ba 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -185,6 +185,9 @@
         case Attributes::ElementwiseOp::Add:
             lut["ELTWISE_OP"] = "ADD";
             break;
+        case Attributes::ElementwiseOp::Sub:
+            lut["ELTWISE_OP"] = "SUB";
+            break;
         case Attributes::ElementwiseOp::Mul:
             lut["ELTWISE_OP"] = "MUL";
             break;
diff --git a/tests/validation/CL/ArithmeticAddition.cpp b/tests/validation/CL/ArithmeticAddition.cpp
index 45632dc..1ed3a10 100644
--- a/tests/validation/CL/ArithmeticAddition.cpp
+++ b/tests/validation/CL/ArithmeticAddition.cpp
@@ -41,6 +41,9 @@
 {
 namespace validation
 {
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/Add.cpp from the dynamic fusion interface.
+ * Please check there for any differences in coverage.
+ */
 namespace
 {
 /** Input data sets **/
diff --git a/tests/validation/CL/ArithmeticSubtraction.cpp b/tests/validation/CL/ArithmeticSubtraction.cpp
index 6a82471..5825ce2 100644
--- a/tests/validation/CL/ArithmeticSubtraction.cpp
+++ b/tests/validation/CL/ArithmeticSubtraction.cpp
@@ -41,6 +41,9 @@
 {
 namespace validation
 {
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/Sub.cpp from the dynamic fusion interface.
+ * Please check there for any differences in coverage.
+ */
 namespace
 {
 /** Input data sets **/
diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index 84aa2e7..62ff15a 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -36,6 +36,9 @@
 {
 namespace validation
 {
+/** Synced with tests/validation/dynamic_fusion/gpu/cl/Mul.cpp from the dynamic fusion interface.
+ * Please check there for any differences in coverage.
+ */
 namespace
 {
 namespace
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
index 52ba052..afe5ee4 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp
@@ -43,7 +43,7 @@
 {
 /* Synced with tests/validation/CL/ArithmeticAddition.cpp from the standard interface.
  *
- * Difference | Why the difference
+ * Difference          | Why the difference
  * No quantized tests  | Not supported yet
  * No in place tests   | Not supported yet
  * No activation tests | Not needed in dynamic fusion interface
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
new file mode 100644
index 0000000..977e011
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Sub.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/datasets/DynamicFusionDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/* Synced with tests/validation/CL/ArithmeticSubtraction.cpp from the standard interface.
+ *
+ * Difference          | Why the difference
+ * No quantized tests  | Not supported yet
+ * No in place tests   | Not supported yet
+ * No activation tests | Not needed in dynamic fusion interface
+ *
+ */
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(SUB)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32),    // Unsupported data type U32
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),    // Unsupported data type QASYMM8
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),    // Unsupported data type QASYMM8_SIGNED
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid data type combination
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),    // S16 is valid data type for Sub
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),    // S32 is valid data type for Sub
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for lhs
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32),    // Broadcast Y dimension is not allowed
+                                                        TensorInfo(TensorShape( 3U,  8U, 9U), 1, DataType::S16),    // Broadcast Z dimension is not allowed
+                                                        TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed
+                                                      }),
+               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8_SIGNED),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32),
+                                                       TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(32U,  1U, 1U), 1, DataType::F32),    // Broadcasting allowed for rhs
+                                                       TensorInfo(TensorShape(15U,  1U, 3U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape( 3U,  8U, 1U), 1, DataType::S16),
+                                                       TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("Expected", { true, false, false, false, false, true, true, false, true, true, false, false, true })),
+               input1_info, input2_info, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    // Validate Elementwise Sub
+    auto          lhs_info         = sketch.create_tensor_info(input1_info);
+    auto          rhs_info         = sketch.create_tensor_info(input2_info);
+
+    bool res = bool(GpuSub::validate_op(sketch, &lhs_info, &rhs_info));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using DynamicFusionCLSubFixture = DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubBroadcastFixture = DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+template <typename T>
+using DynamicFusionCLSubTwoOpsFixture = DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuSub, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLSubFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLargeOneOp,
+                       DynamicFusionCLSubFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::TemporaryLimitedLargeShapesBroadcast()),
+                                       framework::dataset::make("DataType", { DataType::F32 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallTwoOps,
+                       DynamicFusionCLSubTwoOpsFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                                       datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()),
+                                               framework::dataset::make("DataType", { DataType::F32 })),
+                                       framework::dataset::make("InPlace", { false })),
+                               framework::dataset::make("FuseTwoOps", { true })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmallOneOp,
+                       DynamicFusionCLSubFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp,
+                       DynamicFusionCLSubBroadcastFixture<half>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::TemporaryLimitedSmallShapesBroadcast()),
+                                       framework::dataset::make("DataType", { DataType::F16 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+TEST_SUITE_END() // FP16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<int32_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", { DataType::S32 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
+
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<int16_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", { DataType::S16 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       DynamicFusionCLSubFixture<int16_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::LargeShapes()),
+                                       framework::dataset::make("DataType", { DataType::S16 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // S16
+
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       DynamicFusionCLSubFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::SUB }),
+                                               datasets::SmallShapes()),
+                                       framework::dataset::make("DataType", { DataType::U8 })),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
+
+TEST_SUITE_END() // SUB
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute