Dynamic Fusion Pooling Layer 2d

- Adds Dynamic fusion PoolingLayer2D as Unfusable Operator
- Indices are not supported
- Adds tests for F32/F16 Datatypes

Resolves : [COMPMID-5520]

Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I0d112545eb9209c836bf9ea153069f8627531e0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8893
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/Android.bp b/Android.bp
index 8867d5b..b02d104 100644
--- a/Android.bp
+++ b/Android.bp
@@ -591,6 +591,7 @@
         "src/dynamic_fusion/sketch/attributes/CastAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
@@ -608,6 +609,7 @@
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
@@ -617,6 +619,7 @@
         "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
@@ -627,6 +630,7 @@
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
         "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
new file mode 100644
index 0000000..be30781
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+class Pool2dAttributes
+{
+public:
+    /* Get Pooling Type */
+    PoolingType pool_type() const;
+
+    /* Set Pooling Type */
+    Pool2dAttributes pool_type(PoolingType pool_type);
+
+    /* Get 2D Pool Size */
+    Size2D pool_size() const;
+
+    /* Set 2D Pool size */
+    Pool2dAttributes pool_size(const Size2D &pool_size);
+
+    /* Get Padding */
+    Padding2D pad() const;
+
+    /* Set Padding */
+    Pool2dAttributes pad(const Padding2D &padding);
+
+    /* Get Stride */
+    Size2D stride() const;
+
+    /* Set Stride */
+    Pool2dAttributes stride(const Size2D &stride);
+
+    /* Get exclude padding */
+    bool exclude_padding() const;
+
+    /* Set exclude padding */
+    Pool2dAttributes exclude_padding(bool exclude_padding);
+
+private:
+    PoolingType _pool_type{};
+    Padding2D   _pad{};
+    Size2D      _pool_size{};
+    Size2D      _stride{ 1U, 1U };
+    bool        _exclude_padding{ true };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES */
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
new file mode 100644
index 0000000..16d88af
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+class GpuWorkloadContext;
+
+/** Operator backend specific settings
+*/
+class GpuPool2dSettings
+{
+public:
+    /* Get mixed_precision*/
+    bool mixed_precision() const;
+
+    /* Set mixed_precision */
+    GpuPool2dSettings &mixed_precision(bool mixed_precision);
+
+private:
+    bool _mixed_precision{ false };
+};
+
+/** Operator interface. */
+class GpuPool2d final
+{
+public:
+    /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+    using Attributes = Pool2dAttributes;
+    /** Settings are a set of backend-specific parameters that influence the implementation of a operator */
+    using Settings = GpuPool2dSettings;
+
+    /** Create an operator and fuse it into the workload sketch.
+     *    @note If @ref validate_op() fails, the creation also fails and may throw an error.
+     *    @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     *
+     * Valid data layouts:
+     * - NHWC
+     *
+     * @param[in,out] sketch     Workload sketch into which the operator will be fused
+     * @param[in]     src        Source tensor
+     * @param[out]    dst        Destination tensor
+     * @param[in]     attributes Operator attributes
+     * @param[in]     settings   Operator settings
+     */
+    static void create_op(GpuWorkloadSketch &sketch,
+                          ITensorInfo       *src,
+                          ITensorInfo       *dst,
+                          const Attributes &attributes,
+                          const Settings    &settings);
+    /** Check if the operator configuration is supported, irrespective of fusion
+     *
+     * @param[in]  context    Workload context within which the operator is running
+     * @param[in]  src        Left hand side tensor info. Data types supported: F16/F32.
+     * @param[out] dst        Destination tensor info. Data types supported: F16/F32.
+     *                        If an uninitialized ITensorInfo is passed in, it will be auto-initialized
+     * @param[in]  attributes Operator attributes
+     * @param[in]  settings   Operator settings
+     */
+    static Status is_supported_op(const GpuWorkloadContext &context,
+                                  const ITensorInfo        *src,
+                                  const ITensorInfo        *dst,
+                                  const Attributes         &attributes,
+                                  const Settings           &settings);
+    /** Validate the operator and check if it can be fused into the workload sketch.
+     * Similar to @ref GpuPool2d::create_op()
+     */
+    static Status validate_op(const GpuWorkloadSketch &sketch,
+                              const ITensorInfo       *src,
+                              const ITensorInfo       *dst,
+                              const Attributes        &attributes,
+                              const Settings          &settings);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D */
diff --git a/filelist.json b/filelist.json
index ee6f7e3..8be2571 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2206,6 +2206,7 @@
       "src/dynamic_fusion/sketch/attributes/CastAttributes.cpp",
       "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp",
       "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp",
+      "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp",
       "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
       "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
       "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
@@ -2222,6 +2223,7 @@
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp",
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
@@ -2232,6 +2234,7 @@
       "src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
+      "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
       "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
@@ -2240,6 +2243,7 @@
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
       "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
new file mode 100644
index 0000000..c28791f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+PoolingType Pool2dAttributes::pool_type() const
+{
+    return _pool_type;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_type(PoolingType pool_type)
+{
+    _pool_type = pool_type;
+    return *this;
+}
+
+Padding2D Pool2dAttributes::pad() const
+{
+    return _pad;
+}
+
+Pool2dAttributes Pool2dAttributes::pad(const Padding2D &pad)
+{
+    _pad = pad;
+    return *this;
+}
+
+Size2D Pool2dAttributes::pool_size() const
+{
+    return _pool_size;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_size(const Size2D &pool_size)
+{
+    _pool_size = pool_size;
+    return *this;
+}
+
+Size2D Pool2dAttributes::stride() const
+{
+    return _stride;
+}
+
+Pool2dAttributes Pool2dAttributes::stride(const Size2D &stride)
+{
+    _stride = stride;
+    return *this;
+}
+
+bool Pool2dAttributes::exclude_padding() const
+{
+    return _exclude_padding;
+}
+
+Pool2dAttributes Pool2dAttributes::exclude_padding(bool exclude_padding)
+{
+    _exclude_padding = exclude_padding;
+    return *this;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
new file mode 100644
index 0000000..2b01803
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentPool2d::validate(
+    const Properties                &properties,
+    const ArgumentPack<ITensorInfo> &tensors,
+    const Attributes                &attributes,
+    const Settings                  &settings)
+{
+    ARM_COMPUTE_UNUSED(properties);
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type");
+
+    // 1. Check validity
+    // Check if pooling is valid
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+
+    // Matching data type
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+    // Matching data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+    // All tensor infos are initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+    // Device requirements are met
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+                                                       misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
+
+    // 2. Check support level
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+    return Status{};
+}
+
+ClComponentPool2d::ClComponentPool2d(
+    ComponentId                      id,
+    const Properties                &properties,
+    const ArgumentPack<ITensorInfo> &tensors,
+    const Attributes                &attributes,
+    const Settings                  &settings)
+    : IGpuKernelComponent{ id, properties, tensors },
+      _component_writer{ std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentPool2d::~ClComponentPool2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentPool2d::template_writer() const
+{
+    return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
new file mode 100644
index 0000000..896048e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Pool2dAttributes;
+
+/** Forward declaration */
+class ClTemplatePool2d;
+
+class ClComponentPool2d final : public IGpuKernelComponent
+{
+public:
+    /** Attributes are a set of backend-agnostic parameters that define what a component does */
+    using Attributes = Pool2dAttributes;
+    /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+    using Settings = GpuPool2dSettings;
+
+public:
+    /** Validate the component
+     *
+     * @param[in]     properties Component properties
+     * @param[in,out] tensors    Tensor arguments to the component
+     * @param[in]     attributes Component attributes
+     * @param[in]     settings   Component settings
+     *
+     * @return Status       Validation results
+     *
+     * Tensor argument names:
+     * - ACL_SRC_0: Input
+     * - ACL_DST_0: Output
+     *
+     * Tensor argument constness:
+     * - ACL_SRC_0: Const
+     * - ACL_DST_0: Const
+     *
+     * Valid data layouts:
+     * - NHWC
+     *
+     * Valid data type configurations:
+     * |ACL_SRC_0      |ACL_DST_0      |
+     * |:--------------|:--------------|
+     * |F16            |F16            |
+     * |F32            |F32            |
+     */
+    static Status validate(
+        const Properties                &properties,
+        const ArgumentPack<ITensorInfo> &tensors,
+        const Attributes                &attributes,
+        const Settings                  &settings);
+
+    /** Constructor
+     *
+     * @param[in]     id         Unique Component Identifier within a workload
+     * @param[in]     properties Component properties
+     * @param[in,out] tensors    Tensor arguments to the component
+     * @param[in]     attributes Component attributes
+     * @param[in]     settings   Component settings
+     */
+    ClComponentPool2d(
+        ComponentId                      id,
+        const Properties                &properties,
+        const ArgumentPack<ITensorInfo> &tensors,
+        const Attributes                &attributes,
+        const Settings                  &settings);
+
+    /** Destructor */
+    ~ClComponentPool2d() override;
+
+    /** Prevent instances of this class from being copy constructed */
+    ClComponentPool2d(const ClComponentPool2d &component) = delete;
+
+    /** Prevent instances of this class from being copied */
+    ClComponentPool2d &operator=(const ClComponentPool2d &component) = delete;
+
+    /** Allow instances of this class to be move constructed */
+    ClComponentPool2d(ClComponentPool2d &&component) = default;
+
+    /** Allow instances of this class to be moved */
+    ClComponentPool2d &operator=(ClComponentPool2d &&component) = default;
+
+    /** Get template writer for the component */
+    const IGpuTemplateComponentWriter *template_writer() const override;
+
+    /** Get component type */
+    GpuComponentType type() const override
+    {
+        return GpuComponentType::Unfusable;
+    }
+
+private:
+    std::unique_ptr<ClTemplatePool2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D */
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
new file mode 100644
index 0000000..a07ad00
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr GpuOperatorType operator_type = GpuOperatorType::Unfusable;
+} // namespace
+
+GpuPool2dSettings &GpuPool2dSettings::mixed_precision(bool mixed_precision)
+{
+    _mixed_precision = mixed_precision;
+    return *this;
+}
+
+bool GpuPool2dSettings::mixed_precision() const
+{
+    return _mixed_precision;
+}
+
+Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
+                              const ITensorInfo       *src,
+                              const ITensorInfo       *dst,
+                              const Pool2dAttributes &attributes,
+                              const GpuPool2dSettings &settings)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id());
+
+    // Auto initialize dst tensor info
+    TensorInfo dst_info_to_validate = *dst;
+    {
+        auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+        auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+    }
+
+    // Perform fusion test
+    // Pack tensor infos
+    ArgumentPack<ITensorInfo> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, src);
+    tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+    const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+                                    "Operator fusion test failed. This operator cannot be fused into the workload");
+
+    // Check if configuration is supported
+    return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes, settings);
+}
+
+Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
+                                  const ITensorInfo        *src,
+                                  const ITensorInfo        *dst,
+                                  const Pool2dAttributes   &attributes,
+                                  const GpuPool2dSettings &settings)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    // Check exclude padding is not false
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!");
+
+    // Auto initialize dst tensor info
+    TensorInfo dst_info_to_validate = *dst;
+    {
+        auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+        auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+    }
+
+    // Check components
+    if(context.gpu_language() == GpuLanguage::OpenCL)
+    {
+        const auto cl_compile_ctx = context.cl_compile_context();
+        ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+        // Validate Component
+        {
+            const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+            ArgumentPack<ITensorInfo> arguments;
+            arguments.add_const_tensor(ACL_SRC_0, src);
+            arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+            ARM_COMPUTE_RETURN_ON_ERROR(ClComponentPool2d::validate(properties, arguments, attributes, settings));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+    }
+    return Status{};
+}
+
+void GpuPool2d::create_op(GpuWorkloadSketch       &sketch,
+                          ITensorInfo             *src,
+                          ITensorInfo             *dst,
+                          const Pool2dAttributes &attributes,
+                          const GpuPool2dSettings &settings)
+{
+    // Assert validation
+    ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, dst, attributes, settings));
+    ARM_COMPUTE_LOG_PARAMS(src, dst, attributes, settings);
+
+    // Auto initialize dst tensor
+    {
+        auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); // use the default DimensionRoundingType
+        auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+    }
+
+    // Translate into components and add to component graph
+    auto &comp_graph = sketch.implementation().component_graph();
+
+    const auto sketch_ctx = sketch.implementation().context();
+
+    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    {
+        const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+        ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+        // Add Component
+        {
+            auto properties = IGpuKernelComponent::Properties();
+            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+            ArgumentPack<ITensorInfo> arguments;
+            arguments.add_const_tensor(ACL_SRC_0, src);
+            arguments.add_const_tensor(ACL_DST_0, dst);
+            comp_graph.add_new_component<ClComponentPool2d>(properties, arguments, attributes, settings);
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+    }
+
+    // Set up fusion test by adding to the Operator Group
+    // Note this has to be performed after all the components have been successfully added to the component graph
+
+    // Pack tensor infos
+    ArgumentPack<ITensorInfo> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, src);
+    tensors.add_tensor(ACL_DST_0, dst);
+
+    const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+    sketch.implementation().operator_group().add_operator(op);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
new file mode 100644
index 0000000..5df4438
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplatePool2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+// Shape indexes for NHWC Datalayout
+constexpr static int32_t batch_idx   = 3;
+constexpr static int32_t height_idx  = 2;
+constexpr static int32_t width_idx   = 1;
+constexpr static int32_t channel_idx = 0;
+}
+ClTemplatePool2d::ClTemplatePool2d(ComponentId                      id,
+                                   const ArgumentPack<ITensorInfo> &tensors,
+                                   const Attributes                &attributes,
+                                   const Settings                  &settings)
+    : IGpuTemplateComponentWriter{ id, tensors },
+      _src{},
+      _dst{},
+      _attributes{ attributes },
+      _settings{ settings }
+{
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+std::string ClTemplatePool2d::get_name() const
+{
+    return "pool2d";
+}
+
+std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+
+    // Condition to use 2x2 optimized kernel
+    if(_attributes.pool_size() == Size2D(2, 2))
+    {
+        return get_2x2_kernel_code();
+    }
+    else
+    {
+        return get_MxN_kernel_code();
+    }
+}
+
+std::string ClTemplatePool2d::get_MxN_kernel_code() const
+{
+    const auto pool_type          = _attributes.pool_type();
+    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+
+    // Define pool op macro.
+    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+
+    // Kernel start
+    // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
+    // Note: If C is less than N0, N0 should be SHRINKED to the closest smaller N0. This operation is performed on the host side
+    std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src)            {{src}}
+// OUT(dst, accum)      {{dst}}
+
+{
+    const int idx_out_c = g_ind_0;
+    const int idx_out_w = g_ind_1;
+)_";
+
+    // Add macro for POOL_OP
+    code += "\n" + pool_op + "\n";
+
+    code += R"_(
+    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
+    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
+)_";
+
+    // Define common variables.
+    code += R"_(
+    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
+
+    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;
+
+    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
+    res0 = {{INITIAL_VALUE}};
+
+    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
+    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
+
+    const int pool_x_s = max((int)0, -idx_in_w);
+    const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
+    const int pool_y_s = max((int)0, -idx_in_h);
+    const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
+)_";
+
+    // Determine filter size depending on if padding is excluded or not
+    if(_attributes.exclude_padding())
+    {
+        code += R"_(
+    const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+)_";
+    }
+    else
+    {
+        code += R"_(
+    const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
+)_";
+    }
+
+    // Loop through pool size
+    // if global pooling
+    if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
+    {
+        // Begin loop
+        code += R"_(
+    // Global pooling path
+    for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
+    {
+    #pragma unroll 8
+        for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
+        {
+            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
+            data0;
+)_";
+    }
+    else // if local pooling size
+    {
+        code += R"_(
+    for(int y = pool_y_s; y < pool_y_e; ++y)
+    {
+    #pragma unroll 8
+        for(int x = pool_x_s; x < pool_x_e; ++x)
+        {
+            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
+            data0;
+)_";
+    } // end else
+
+    // if condition inside loop - use 32bit acc if mixed_precision.
+    // End loop through pooling section.
+    if(fp_mixed_precision)
+    {
+        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+        code += R"_(
+            data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
+            res0 = POOL_OP(res0, data0);
+        }
+    }
+)_";
+    }
+    else // load data, compute result and end loop
+    {
+        code += R"_(
+            data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
+            res0 = POOL_OP(res0, data0);
+        }
+    }
+)_";
+    }
+
+    // For Pool AVG ONLY, divide pool output by filter size
+    if(pool_type == PoolingType::AVG)
+    {
+        code += R"_(
+    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
+)_";
+    }
+
+    // If mixed precision convert datatype before storing. Then end kernel.
+    if(fp_mixed_precision)
+    {
+        code += R"_(
+    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
+    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
+    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
+)_";
+    }
+    else
+    {
+        // Store data
+        code += R"_(
+    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
+)_";
+    }
+
+    code += R"_(
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+}
+)_";
+
+    return code;
+}
+
+std::string ClTemplatePool2d::get_2x2_kernel_code() const
+{
+    const auto  pool_type          = _attributes.pool_type();
+    const bool  fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+    std::string pool_op            = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+
+    std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src)            {{src}}
+// OUT(dst, accum)      {{dst}}
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
+
+{
+    const int idx_out_c = g_ind_0;
+    const int idx_out_w = g_ind_1;
+)_";
+
+    // Add pool op macro
+    code += "\n" + pool_op + "\n";
+
+    // If batch size != 1, the batch size dimension is collapsed over the height dimension
+    code += R"_(
+    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
+    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
+)_";
+
+    code += R"_(
+    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
+    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
+
+    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
+    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
+                                           {{dst}}_stride_w;
+    const int pool_x_s = max((int)0, -idx_in_w);
+    const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
+    const int pool_y_s = max((int)0, -idx_in_h);
+    const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);
+
+    const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+    const int x0 = pool_x_s + idx_in_w;
+    const int y0 = pool_y_s + idx_in_h;
+    const int x1 = pool_x_e - 1 + idx_in_w;
+    const int y1 = pool_y_e - 1 + idx_in_h;
+
+    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
+)_";
+
+    if(fp_mixed_precision)
+    {
+        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+        code += R"_(
+    data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
+    data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
+    data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
+    data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
+)_";
+    }
+    else
+    {
+        code += R"_(
+    data0         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
+    data1         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
+    data2         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
+    data3         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
+)_";
+    }
+
+    if(pool_type != PoolingType::MAX)
+    {
+        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
+        code += R"_(
+    if(filter_size != 4)
+    {
+        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
+        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);
+
+        data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
+        data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
+        data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
+        data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
+    }
+)_";
+    }
+
+    code += R"_(
+    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
+    res0 = data0;
+    res0 = POOL_OP(res0, data1);
+    res0 = POOL_OP(res0, data2);
+    res0 = POOL_OP(res0, data3);
+)_";
+
+    if(pool_type == PoolingType::AVG)
+    {
+        // If avg pooling divide result accordingly.
+        if(_attributes.exclude_padding())
+        {
+            code += R"_(
+    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
+)_";
+        }
+        else
+        {
+            code += R"_(
+    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
+)_";
+        }
+    }
+
+    // Store result
+    if(fp_mixed_precision)
+    {
+        code += R"_(
+    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
+    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
+    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
+)_";
+    }
+    else
+    {
+        code += R"_(
+    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
+)_";
+    }
+
+    code += R"_(
+    //------------------ END KERNEL {{meta_kernel_id}} ---------------------
+}
+#undef SELECT_TYPE
+)_";
+
+    return code;
+}
+
+void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    vtable.declare_variable(
+        comp_group,
+        _src,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        "src");
+
+    vtable.declare_variable(
+        comp_group,
+        _dst,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        "dst");
+}
+
+TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+
+    TagLUT lut{};
+    // Arguments and global shared variables
+    lut["src"] = vtable.get_variable(_src);
+    lut["dst"] = vtable.get_variable(_dst);
+
+    // Local build options
+    lut["meta_kernel_id"] = id();
+
+    // Retrieve relevant data
+    const auto padding                = _attributes.pad();
+    const auto stride                 = _attributes.stride();
+    const auto pool_size              = _attributes.pool_size();
+    const auto data_type              = _src->data_type();
+    const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
+
+    // pool specific
+    lut["STRIDE_X"]    = stride.x();
+    lut["STRIDE_Y"]    = stride.y();
+    lut["PAD_X"]       = padding.left;
+    lut["PAD_Y"]       = padding.top;
+    lut["POOL_SIZE_X"] = pool_size.width;
+    lut["POOL_SIZE_Y"] = pool_size.height;
+
+    // Datatypes and variables
+    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
+    lut["DATA_TYPE"]     = get_cl_type_from_data_type(data_type);
+    lut["SRC_WIDTH"]     = _src->dimension(width_idx);
+    lut["SRC_HEIGHT"]    = _src->dimension(height_idx);
+    lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? float_to_string_with_full_precision(std::numeric_limits<float>::lowest()) : std::string("0");
+
+    // Tensor specific data
+    lut["DST_HEIGHT"] = _dst->dimension(height_idx);
+
+    return lut;
+}
+
+CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
+{
+    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
+    const unsigned int n0               = root_window.x().step();
+    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+    CLBuildOptions build_opts{};
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    return build_opts;
+}
+
+std::string ClTemplatePool2d::get_config_id() const
+{
+    const DataType   data_type   = _src->data_type();
+    const DataLayout data_layout = _src->data_layout();
+
+    std::string config_id{};
+    config_id += "pooling_layer_2d_";
+    config_id += lower_string(string_from_data_type(data_type));
+    config_id += "_";
+    config_id += lower_string(string_from_data_layout(data_layout));
+    config_id += "_";
+    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+    config_id += "_";
+    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+    config_id += "_";
+    config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
+
+    return config_id;
+}
+
+std::set<std::string> ClTemplatePool2d::get_headers_list() const
+{
+    return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
+}
+
+Window ClTemplatePool2d::get_window() const
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+    const auto         output_shape = _dst->tensor_shape();
+    const unsigned int vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+
+    // Create and configure kernel window
+    auto win = calculate_max_window(output_shape, Steps(vec_size));
+    win      = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
+    return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
new file mode 100644
index 0000000..ef1c100
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplatePool2d final : public IGpuTemplateComponentWriter
+{
+public:
+    using Attributes = ClComponentPool2d::Attributes;
+    using Settings   = ClComponentPool2d::Settings;
+    /** Constructor
+     *
+     * @param[in] id         Component id
+     * @param[in] tensors    Tensor arguments to the components
+     * @param[in] attributes Component attributes
+     * @param[in] settings   Component settings
+     */
+    ClTemplatePool2d(ComponentId                      id,
+                     const ArgumentPack<ITensorInfo> &tensors,
+                     const Attributes                &attributes,
+                     const Settings                  &settings);
+
+    /** Prevent instances of this class from being copy constructed */
+    ClTemplatePool2d(const ClTemplatePool2d &direct_conv2d) = delete;
+
+    /** Prevent instances of this class from being copied */
+    ClTemplatePool2d &operator=(const ClTemplatePool2d &direct_conv2d) = delete;
+
+    /** Allow instances of this class to be move constructed */
+    ClTemplatePool2d(ClTemplatePool2d &&direct_conv2d) = default;
+
+    /** Allow instances of this class to be moved */
+    ClTemplatePool2d &operator=(ClTemplatePool2d &&direct_conv2d) = default;
+
+    /** Generate kernel component name */
+    std::string get_name() const override;
+
+    /** Generate kernel component code template
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return std::string Component code
+     */
+    std::string get_component_code(const ComponentGroup &comp_group) const override;
+    /** Declare all variables used by the component in the @p vtable
+     *
+     * @param[out] vtable     Variable table
+     * @param[in]  comp_group Component group of which the component is a part of
+     */
+    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+    /** Generate the tag look-up table used to instantiate the component code.
+     *
+     * @param[in] vtable     Variable table
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return TagLUT  Tag lookup table
+     */
+    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+    /** Generate the build options used in the component
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return CLBuildOptions Build options
+     */
+    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+
+    /** Generate the component config id string used for tuning */
+    std::string get_config_id() const override;
+
+    /** Generate the header list used in the component */
+    std::set<std::string> get_headers_list() const override;
+
+    /** Generate the execution window for the component */
+    Window get_window() const override;
+
+private:
+    /** Generate pooling kernel template code optimized for 2x2 pooling
+     *
+     * @return std::String Component code
+     */
+    std::string get_2x2_kernel_code() const;
+
+    /** Generate generalised pooling kernel template code for MxN pooling
+     *
+     * @return std::String Component code
+     */
+    std::string get_MxN_kernel_code() const;
+
+    const ITensorInfo *_src;
+    const ITensorInfo *_dst;
+    Attributes         _attributes;
+    Settings           _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D */
diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h
index 063dbdc..d317ec7 100644
--- a/src/dynamic_fusion/utils/Utils.h
+++ b/src/dynamic_fusion/utils/Utils.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_UTILS_UTILS
 
 #include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
 
 namespace arm_compute
 {
@@ -32,25 +33,37 @@
 {
 namespace dynamic_fusion
 {
-bool is_user_tensor(const ITensorInfo *tensor_info)
+inline bool is_user_tensor(const ITensorInfo *tensor_info)
 {
     return tensor_info->id() > ITensorInfo::invalid_tensor_id;
 }
 
-bool is_intermediate_tensor(const ITensorInfo *tensor_info)
+inline bool is_intermediate_tensor(const ITensorInfo *tensor_info)
 {
     return tensor_info->id() < ITensorInfo::invalid_tensor_id;
 }
 
-bool is_valid_tensor(const ITensorInfo *tensor_info)
+inline bool is_valid_tensor(const ITensorInfo *tensor_info)
 {
     return tensor_info->has_valid_id();
 }
 
-bool is_invalid_tensor(const ITensorInfo *tensor_info)
+inline bool is_invalid_tensor(const ITensorInfo *tensor_info)
 {
     return !is_valid_tensor(tensor_info);
 }
+
+/** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo
+*/
+inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC)
+{
+    // Create PadStrideInfo
+    const Size2D        stride  = pool_attr.stride();
+    const Padding2D     padding = pool_attr.pad();
+    const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR);
+
+    return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision);
+}
 }
 }
 }
diff --git a/tests/datasets/dynamic_fusion/PoolingLayerDataset.h b/tests/datasets/dynamic_fusion/PoolingLayerDataset.h
new file mode 100644
index 0000000..c4911f4
--- /dev/null
+++ b/tests/datasets/dynamic_fusion/PoolingLayerDataset.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "utils/TypePrinter.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+
+using Pool2dAttributes = arm_compute::experimental::dynamic_fusion::Pool2dAttributes;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+
+class DynamicFusionPoolingLayerDataset
+{
+public:
+    using type = std::tuple<TensorShape, Pool2dAttributes>;
+
+    struct iterator
+    {
+        iterator(std::vector<TensorShape>::const_iterator      src_it,
+                 std::vector<Pool2dAttributes>::const_iterator infos_it)
+            : _src_it{ std::move(src_it) },
+              _infos_it{ std::move(infos_it) }
+        {
+        }
+
+        std::string description() const
+        {
+            std::stringstream description;
+            description << "In=" << *_src_it << ":";
+            description << "Info=" << *_infos_it << ":";
+            return description.str();
+        }
+
+        DynamicFusionPoolingLayerDataset::type operator*() const
+        {
+            return std::make_tuple(*_src_it, *_infos_it);
+        }
+
+        iterator &operator++()
+        {
+            ++_src_it;
+            ++_infos_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator      _src_it;
+        std::vector<Pool2dAttributes>::const_iterator _infos_it;
+    };
+
+    iterator begin() const
+    {
+        return iterator(_src_shapes.begin(), _infos.begin());
+    }
+
+    int size() const
+    {
+        return std::min(_src_shapes.size(), _infos.size());
+    }
+
+    void add_config(TensorShape src, Pool2dAttributes info)
+    {
+        _src_shapes.emplace_back(std::move(src));
+        _infos.emplace_back(std::move(info));
+    }
+
+protected:
+    DynamicFusionPoolingLayerDataset()                       = default;
+    DynamicFusionPoolingLayerDataset(DynamicFusionPoolingLayerDataset &&) = default;
+
+private:
+    std::vector<TensorShape>      _src_shapes{};
+    std::vector<Pool2dAttributes> _infos{};
+};
+
+// Special pooling dataset
+class PoolingLayerDatasetSpecialDynamicFusion final : public DynamicFusionPoolingLayerDataset
+{
+public:
+    PoolingLayerDatasetSpecialDynamicFusion()
+    {
+        // NCHW DataLayout 
+        // Special cases
+        add_config(TensorShape(2U, 3U, 4U, 1U), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).stride(Size2D(3,3)));
+        add_config(TensorShape(60U, 52U, 3U, 2U), Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(100,100)).stride(Size2D(5,5)).pad(Padding2D(50,50,50,50)));
+        // Asymmetric padding
+        add_config(TensorShape(112U, 112U, 32U), Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(3,3)).pad(Padding2D(0,1,0,1)).stride(Size2D(2,2)));
+        add_config(TensorShape(14U, 14U, 832U), Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(2,2)).stride(Size2D(1,1)).pad(Padding2D(0,0,0,0)));
+
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
new file mode 100644
index 0000000..a7772ae
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/dynamic_fusion/PoolingLayerDataset.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(POOL2D)
+
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
+constexpr AbsoluteTolerance<float> tolerance_f16(0.01f);  /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
+
+const auto PoolingLayerDatasetFP = combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3) })),
+                                                           framework::dataset::make("Pad", { Padding2D() })),
+                                                   framework::dataset::make("Stride", { Size2D(1, 1), Size2D(2, 1), Size2D(5, 7) })),
+                                           framework::dataset::make("ExcludePadding", { true }));
+
+const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false });
+
+template <typename T>
+using DynamicFusionGpuPool2dFixture = DynamicFusionGpuPool2dValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+
+template <typename T>
+using DFSpecialGpuPool2dFixture = DynamicFusionGpuPool2dSpecialValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+
+template <typename T>
+using DFPoolMixedPrecisionFixture = DynamicFusionGpuPool2dMixedPrecisionValidationFixture<CLTensor, CLAccessor, GpuPool2d, T>;
+// *INDENT-OFF*
+// clang-format off
+
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+            framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Mismatching data type
+                                                    TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Invalid pad/size combination
+                                                    TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Invalid pad/size combination
+                                                    TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid parameters, unsupported pooling
+                                                    TensorInfo(TensorShape(5U, 15U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Valid Non-rectangular Global Pooling
+                                                    TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Invalid output Global Pooling
+                                                    TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Invalid - Quantized not supported.
+                                                    TensorInfo(TensorShape(5U, 13U, 13U), 1, DataType::F32, DataLayout::NHWC),     // Valid global pooling
+                                                    TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32, DataLayout::NCHW),     // Unsupported data layout
+                                                }),
+            framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F16, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(2U, 30U, 11U), 1, DataType::F32, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(2U, 25U, 16U), 1, DataType::F32, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(5U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(5U, 2U, 2U), 1, DataType::F32, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(5U, 12U, 12U), 1, DataType::QASYMM8, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(5U, 1U, 1U), 1, DataType::F32, DataLayout::NHWC),
+                                                    TensorInfo(TensorShape(1U, 1U, 5U), 1, DataType::F32, DataLayout::NHWC),
+                                                })),
+            framework::dataset::make("Pool2dAttributes", {
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D(2,2,0,0)).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D(0,0,2,2)).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::L2).pool_size(Size2D(3,3)).pad(Padding2D(0,0,0,0)).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(15U, 13U)),
+                                                    Pool2dAttributes().pool_type(PoolingType::MAX).pool_size(Size2D(13U, 13U)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(2,2)).pad(Padding2D()).stride(Size2D(1,1)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+                                                    Pool2dAttributes().pool_type(PoolingType::AVG).pool_size(Size2D(13U,13U)),
+                                                })),
+            framework::dataset::make("Expected", { false, false, false, false, true, false, false, true, false })),
+            input_info, output_info, pool2d_attr, expected)
+{
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    // Declare GpuPool2d settings
+    const GpuPool2dSettings &settings = GpuPool2dSettings().mixed_precision(false);
+
+    // Validate Pool2d Configuration
+    auto                   src_info    = sketch.create_tensor_info(input_info);
+    auto                   dst_info    = sketch.create_tensor_info(output_info);
+    bool                   res         = bool(GpuPool2d::validate_op(sketch, &src_info, &dst_info, pool2d_attr, settings));
+    ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS);
+}
+
+// clang-format on
+// *INDENT-ON*
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuPool2dFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallNoneUnitShapes(), PoolingLayerDatasetFP),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuPool2dFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), PoolingLayerDatasetFP),
+                                                                                                                framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunSpecial, DFSpecialGpuPool2dFixture<float>, framework::DatasetMode::ALL, combine(datasets::PoolingLayerDatasetSpecialDynamicFusion(),
+                                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuPool2dFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                                                   framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                                                            TensorShape(27U, 13U, 2U, 4U)
+                                                                                                          }),
+                                                                   framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::MAX })),
+                                                               framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                                                       framework::dataset::make("Pad", { Padding2D() })),
+                                               framework::dataset::make("Stride", { Size2D(1, 1) })),
+                                       framework::dataset::make("ExcludePadding", true)),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuPool2dFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                                                   framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                                                            TensorShape(79U, 37U, 11U, 4U)
+                                                                                                          }),
+                                                                   framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::MAX })),
+                                                               framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                                                       framework::dataset::make("Pad", { Padding2D() })),
+                                               framework::dataset::make("Stride", { Size2D(1, 1) })),
+                                       framework::dataset::make("ExcludePadding", true)),
+                               framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, DFPoolMixedPrecisionFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(), PoolingLayerDatasetFP),
+                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
+                                                                                                               pool_fp_mixed_precision_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, DFPoolMixedPrecisionFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), PoolingLayerDatasetFP),
+                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
+                                                                                                             pool_fp_mixed_precision_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+TEST_SUITE(GlobalPooling)
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuPool2dFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                                                   framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                                                            TensorShape(27U, 13U, 2U, 4U)
+                                                                                                          }),
+                                                                   framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::MAX })),
+                                                               framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                                                       framework::dataset::make("Pad", { Padding2D() })),
+                                               framework::dataset::make("Stride", { Size2D(1, 1) })),
+                                       framework::dataset::make("ExcludePadding", true)),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuPool2dFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                                                                   framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                                                            TensorShape(79U, 37U, 11U, 4U)
+                                                                                                          }),
+                                                                   framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::MAX })),
+                                                               framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                                                       framework::dataset::make("Pad", { Padding2D() })),
+                                               framework::dataset::make("Stride", { Size2D(1, 1) })),
+                                       framework::dataset::make("ExcludePadding", true)),
+                               framework::dataset::make("DataType", DataType::F16)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // GlobalPooling
+TEST_SUITE_END() // FP16
+TEST_SUITE_END() // FLOAT
+
+TEST_SUITE_END() // POOL2D
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // CL
+}
+}
+}
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
new file mode 100644
index 0000000..efb67f8
--- /dev/null
+++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/Pool2dFixture.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE
+#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/PoolingLayer.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dValidationGenericFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, const Pool2dAttributes &pool_attr, DataType data_type, bool mixed_precision)
+    {
+        _target    = compute_target(input_shape, pool_attr, data_type, mixed_precision);
+        _reference = compute_reference(input_shape, convert_pool_attr_to_pool_info(pool_attr, mixed_precision), data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F16:
+            {
+                arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<float> distribution(-1.0f, 1.0f);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+                library->fill_tensor_uniform(tensor, i);
+        }
+    }
+
+    // Given input is in nchw format
+    TensorType compute_target(TensorShape input_shape, const Pool2dAttributes &pool_attr, const DataType data_type, bool mixed_precision)
+    {
+        CLScheduler::get().default_reinit();
+
+        // Change shape due to NHWC data layout, test shapes are NCHW
+        permute(input_shape, PermutationVector(2U, 0U, 1U));
+
+        // Create a new workload sketch
+        auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+        auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+        GpuWorkloadSketch sketch{ &gpu_ctx };
+
+        // Create sketch tensors
+        auto input_info = sketch.create_tensor_info(TensorInfo(input_shape, 1, data_type, DataLayout::NHWC));
+        auto dst_info   = sketch.create_tensor_info();
+
+        // Create Pool2dSettings
+        GpuPool2dSettings pool_settings = GpuPool2dSettings().mixed_precision(mixed_precision);
+
+        FunctionType::create_op(sketch, &input_info, &dst_info, pool_attr, pool_settings);
+
+        // Configure runtime
+        ClWorkloadRuntime runtime;
+        runtime.configure(sketch);
+        // (Important) Allocate auxiliary tensor memory if there are any
+        for(auto &data : runtime.get_auxiliary_tensors())
+        {
+            auto       tensor      = data.first;
+            const auto aux_mem_req = data.second;
+            tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+            tensor->allocator()->allocate(); // Use ACL allocated memory
+        }
+        // Construct user tensors
+        TensorType t_input{};
+        TensorType t_dst{};
+
+        // Initialize user tensors
+        t_input.allocator()->init(input_info);
+        t_dst.allocator()->init(dst_info);
+
+        // Allocate and fill user tensors
+        t_input.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill(AccessorType(t_input), 0);
+
+        // Run runtime
+        runtime.run({ &t_input, &t_dst });
+        return t_dst;
+    }
+
+    SimpleTensor<T> compute_reference(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src(shape, data_type, 1, QuantizationInfo());
+        // Fill reference
+        fill(src, 0);
+        return reference::pooling_layer<T>(src, pool_info, QuantizationInfo(), nullptr, DataLayout::NCHW);
+    }
+
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dValidationFixture : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, PoolingType pool_type, Size2D pool_size, Padding2D pad, Size2D stride, bool exclude_padding, DataType data_type)
+    {
+        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape,
+                                                                                                         Pool2dAttributes().pool_type(pool_type).pool_size(pool_size).pad(pad).stride(stride).exclude_padding(exclude_padding),
+                                                                                                         data_type, false);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dMixedPrecisionValidationFixture : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, PoolingType pool_type, Size2D pool_size, Padding2D pad, Size2D stride, bool exclude_padding, DataType data_type, bool mixed_precision)
+    {
+        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape,
+                                                                                                         Pool2dAttributes().pool_type(pool_type).pool_size(pool_size).pad(pad).stride(stride).exclude_padding(exclude_padding),
+                                                                                                         data_type, mixed_precision);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class DynamicFusionGpuPool2dSpecialValidationFixture : public DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, Pool2dAttributes pool_attr, DataType data_type)
+    {
+        DynamicFusionGpuPool2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, pool_attr, data_type, false);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_POOL2DFIXTURE */
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 9e671e3..378d91d 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,6 @@
 template <typename T, typename ACC_T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> pooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info, SimpleTensor<uint32_t> *indices, DataLayout data_layout)
 {
-    ARM_COMPUTE_ERROR_ON(info.is_global_pooling && (src.shape().x() != src.shape().y()));
     // Create reference
     SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
     auto            pooled_shape = compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info);
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 8f9c249..2978a23 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -42,8 +42,10 @@
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
 #include "arm_compute/runtime/CL/CLTunerTypes.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
@@ -3407,6 +3409,65 @@
     return str.str();
 }
 
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::Pool2dAttributes type.
+ *
+ * @param[out] os          Output stream.
+ * @param[in]  pool2d_attr arm_compute::experimental::dynamic_fusion::Pool2dAttributes type to output.
+ *
+ * @return Modified output stream.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::Pool2dAttributes &pool2d_attr)
+{
+    os << "Pool2dAttributes="
+       << "["
+       << "PoolingType=" << pool2d_attr.pool_type() << ","
+       << "PoolSize=" << pool2d_attr.pool_size() << ","
+       << "Padding=" << pool2d_attr.pad() << ","
+       << "Stride=" << pool2d_attr.stride() << ","
+       << "ExcludePadding" << pool2d_attr.exclude_padding() << "]";
+
+    return os;
+}
+
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::Pool2dAttributes type.
+ *
+ * @param[in] pool2d_attr arm_compute::experimental::dynamic_fusion::Pool2dAttributes type to output.
+ *
+ * @return Formatted string.
+ */
+inline std::string to_string(const experimental::dynamic_fusion::Pool2dAttributes &pool2d_attr)
+{
+    std::stringstream str;
+    str << pool2d_attr;
+    return str.str();
+}
+
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::GpuPool2dSettings type
+ *
+ * @param[out] os       Output stream
+ * @param[in]  settings arm_compute::dynamic_fusion::GpuPool2dSettings type to output
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::GpuPool2dSettings &settings)
+{
+    os << "Settings="
+       << "["
+       << "FPMixedPrecision=" << settings.mixed_precision() << "]";
+    return os;
+}
+
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::GpuPool2dSettings type.
+ *
+ * @param[in] settings arm_compute::experimental::dynamic_fusion::GpuPool2dSettings type to output.
+ *
+ * @return Formatted string.
+ */
+inline std::string to_string(const experimental::dynamic_fusion::GpuPool2dSettings &settings)
+{
+    std::stringstream str;
+    str << settings;
+    return str.str();
+}
+
 /** Formatted output of the arm_compute::experimental::dynamic_fusion::Conv2dAttributes type.
  *
  * @param[out] os          Output stream.
@@ -3424,6 +3485,7 @@
 
     return os;
 }
+
 /** Formatted output of the arm_compute::experimental::dynamic_fusion::Conv2dAttributes type.
  *
  * @param[in] conv2d_attr arm_compute::experimental::dynamic_fusion::Conv2dAttributes type to output.