IVGCVSW-6509 Front End + Reference Workload implementation

Subtask of story: IVGCVSW-6164 Add a Pooling3d FrontEnd and Ref Implementation

* Add front end
* Add reference workload
* Add corresponding unit tests

Change-Id: Icce4146dd0a06a1da46a2def00a82d343e171750
Signed-off-by: Tamas Nyiri <tamas.nyiri@arm.com>
diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp
index c3cebdd..f561b93 100644
--- a/src/armnn/BackendHelper.cpp
+++ b/src/armnn/BackendHelper.cpp
@@ -646,6 +646,14 @@
     return m_LayerSupport->IsPooling2dSupported(input, output, descriptor, reasonIfUnsupported.value());
 }
 
+bool LayerSupportHandle::IsPooling3dSupported(const TensorInfo& input,
+                                              const TensorInfo& output,
+                                              const Pooling3dDescriptor& descriptor,
+                                              Optional<std::string&> reasonIfUnsupported)
+{
+    return m_LayerSupport->IsPooling3dSupported(input, output, descriptor, reasonIfUnsupported.value());
+}
+
 bool LayerSupportHandle::IsPreCompiledSupported(const TensorInfo& input,
                                                 const PreCompiledDescriptor& descriptor,
                                                 Optional<std::string&> reasonIfUnsupported)
diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp
index 49c39b3..607c83b 100644
--- a/src/armnn/LayersFwd.hpp
+++ b/src/armnn/LayersFwd.hpp
@@ -53,6 +53,7 @@
 #include "layers/PadLayer.hpp"
 #include "layers/PermuteLayer.hpp"
 #include "layers/Pooling2dLayer.hpp"
+#include "layers/Pooling3dLayer.hpp"
 #include "layers/PreCompiledLayer.hpp"
 #include "layers/PreluLayer.hpp"
 #include "layers/QuantizeLayer.hpp"
@@ -152,6 +153,7 @@
 DECLARE_LAYER(Pad)
 DECLARE_LAYER(Permute)
 DECLARE_LAYER(Pooling2d)
+DECLARE_LAYER(Pooling3d)
 DECLARE_LAYER(PreCompiled)
 DECLARE_LAYER(Prelu)
 DECLARE_LAYER(Quantize)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 17a1da1..d3a7f97 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -208,6 +208,12 @@
     return pNetworkImpl->AddPooling2dLayer(pooling2dDescriptor, name);
 }
 
+IConnectableLayer* INetwork::AddPooling3dLayer(const Pooling3dDescriptor& pooling3dDescriptor,
+                                               const char* name)
+{
+    return pNetworkImpl->AddPooling3dLayer(pooling3dDescriptor, name);
+}
+
 IConnectableLayer* INetwork::AddActivationLayer(const ActivationDescriptor& activationDescriptor,
                                                 const char* name)
 {
@@ -2033,6 +2039,12 @@
     return m_Graph->AddLayer<Pooling2dLayer>(pooling2dDescriptor, name);
 }
 
+IConnectableLayer* NetworkImpl::AddPooling3dLayer(const Pooling3dDescriptor& pooling3dDescriptor,
+    const char* name)
+{
+    return m_Graph->AddLayer<Pooling3dLayer>(pooling3dDescriptor, name);
+}
+
 IConnectableLayer* NetworkImpl::AddActivationLayer(const ActivationDescriptor& activationDescriptor,
     const char* name)
 {
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 818a765..959d88d 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -167,6 +167,9 @@
     IConnectableLayer* AddPooling2dLayer(const Pooling2dDescriptor& pooling2dDescriptor,
                                          const char* name = nullptr);
 
+    IConnectableLayer* AddPooling3dLayer(const Pooling3dDescriptor& pooling3dDescriptor,
+                                         const char* name = nullptr);
+
     IConnectableLayer* AddPreluLayer(const char* name = nullptr);
 
     IConnectableLayer* AddQuantizeLayer(const char* name = nullptr);
diff --git a/src/armnn/layers/Pooling3dLayer.cpp b/src/armnn/layers/Pooling3dLayer.cpp
new file mode 100644
index 0000000..884f8e0
--- /dev/null
+++ b/src/armnn/layers/Pooling3dLayer.cpp
@@ -0,0 +1,131 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Pooling3dLayer.hpp"
+
+#include "LayerCloneBase.hpp"
+
+#include <armnn/TypesUtils.hpp>
+
+#include <armnnUtils/DataLayoutIndexed.hpp>
+
+#include <backendsCommon/WorkloadData.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+using namespace armnnUtils;
+
+namespace armnn
+{
+
+Pooling3dLayer::Pooling3dLayer(const Pooling3dDescriptor& param, const char* name)
+    : LayerWithParameters(1, 1, LayerType::Pooling3d, param, name)
+{
+}
+
+std::unique_ptr<IWorkload> Pooling3dLayer::CreateWorkload(const IWorkloadFactory& factory) const
+{
+    Pooling3dQueueDescriptor descriptor;
+    SetAdditionalInfo(descriptor);
+
+    return factory.CreatePooling3d(descriptor, PrepInfoAndDesc(descriptor));
+}
+
+Pooling3dLayer* Pooling3dLayer::Clone(Graph& graph) const
+{
+    return CloneBase<Pooling3dLayer>(graph, m_Param, GetName());
+}
+
+std::vector<TensorShape> Pooling3dLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
+{
+    ARMNN_ASSERT(inputShapes.size() == 1);
+    const TensorShape& inputShape = inputShapes[0];
+    const DataLayoutIndexed dimensionIndices = m_Param.m_DataLayout;
+
+    // If we support multiple batch dimensions in the future, then this assert will need to change.
+    ARMNN_ASSERT_MSG(inputShape.GetNumDimensions() == 5, "Pooling3dLayer will always have 5D input.");
+
+    unsigned int inWidth = inputShape[dimensionIndices.GetWidthIndex()];
+    unsigned int inHeight = inputShape[dimensionIndices.GetHeightIndex()];
+    unsigned int inDepth = inputShape[dimensionIndices.GetDepthIndex()];
+    unsigned int inChannels = inputShape[dimensionIndices.GetChannelsIndex()];
+    unsigned int inBatchSize = inputShape[0];
+
+    bool isGlobalPooling = (m_Param.m_StrideX==0 && m_Param.m_StrideY==0 && m_Param.m_StrideZ==0);
+    unsigned int outWidth = 1;
+    unsigned int outHeight = 1;
+    unsigned int outDepth = 1;
+    if (!isGlobalPooling)
+    {
+        ARMNN_ASSERT_MSG(m_Param.m_StrideX!=0 && m_Param.m_StrideY!=0 && m_Param.m_StrideZ!=0,
+                         "Stride can only be zero when performing global pooling");
+
+        auto CalcSize = [](auto inSize, auto lowPad, auto highPad, auto poolSize, auto stride, auto outputShapeRounding)
+            {
+                unsigned int readSize = inSize + lowPad + highPad - poolSize;
+                float div = static_cast<float>(readSize) / static_cast<float>(stride);
+
+                unsigned int size = 0;
+                switch (outputShapeRounding)
+                {
+                    case OutputShapeRounding::Ceiling:
+                        size = static_cast<unsigned int>(ceil(div)) + 1;
+                        break;
+                    case OutputShapeRounding::Floor:
+                        size = static_cast<unsigned int>(floor(div)) + 1;
+                        break;
+                    default:
+                        ARMNN_ASSERT_MSG(false, "Unsupported Output Shape Rounding");
+                }
+
+                // Makes sure that border operations will start from inside the input and not the padded area.
+                // This is what CL does...
+                if ((size - 1)*stride >= inSize + lowPad)
+                {
+                    --size;
+                }
+
+                return size;
+            };
+
+        outWidth = CalcSize(inWidth, m_Param.m_PadLeft, m_Param.m_PadRight, m_Param.m_PoolWidth, m_Param.m_StrideX,
+                            m_Param.m_OutputShapeRounding);
+        outHeight = CalcSize(inHeight, m_Param.m_PadTop, m_Param.m_PadBottom, m_Param.m_PoolHeight, m_Param.m_StrideY,
+                            m_Param.m_OutputShapeRounding);
+        outDepth = CalcSize(inDepth, m_Param.m_PadFront, m_Param.m_PadBack, m_Param.m_PoolDepth, m_Param.m_StrideZ,
+                            m_Param.m_OutputShapeRounding);
+    }
+    unsigned int outChannels = inChannels;
+    unsigned int outBatchSize = inBatchSize;
+
+    TensorShape tensorShape = m_Param.m_DataLayout == armnn::DataLayout::NDHWC ?
+        TensorShape( { outBatchSize, outDepth, outHeight, outWidth, outChannels } ) :
+        TensorShape( { outBatchSize, outChannels, outDepth, outHeight, outWidth });
+
+    return std::vector<TensorShape>({ tensorShape });
+}
+
+void Pooling3dLayer::ValidateTensorShapesFromInputs()
+{
+    VerifyLayerConnections(1, CHECK_LOCATION());
+
+    const TensorShape& outputShape = GetOutputSlot(0).GetTensorInfo().GetShape();
+
+    VerifyShapeInferenceType(outputShape, m_ShapeInferenceMethod);
+
+    auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() });
+
+    ARMNN_ASSERT(inferredShapes.size() == 1);
+
+    ValidateAndCopyShape(outputShape, inferredShapes[0], m_ShapeInferenceMethod, "Pooling3dLayer");
+}
+
+ARMNN_NO_DEPRECATE_WARN_BEGIN
+void Pooling3dLayer::Accept(ILayerVisitor& visitor) const
+{
+    visitor.VisitPooling3dLayer(this, GetParameters(), GetName());
+}
+ARMNN_NO_DEPRECATE_WARN_END
+
+} // namespace armnn
diff --git a/src/armnn/layers/Pooling3dLayer.hpp b/src/armnn/layers/Pooling3dLayer.hpp
new file mode 100644
index 0000000..0aa4853
--- /dev/null
+++ b/src/armnn/layers/Pooling3dLayer.hpp
@@ -0,0 +1,52 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "LayerWithParameters.hpp"
+
+namespace armnn
+{
+
+/// This layer represents a pooling 3d operation.
+class Pooling3dLayer : public LayerWithParameters<Pooling3dDescriptor>
+{
+public:
+    /// Makes a workload for the Pooling3d type.
+    /// @param [in] graph The graph where this layer can be found.
+    /// @param [in] factory The workload factory which will create the workload.
+    /// @return A pointer to the created workload, or nullptr if not created.
+    virtual std::unique_ptr<IWorkload> CreateWorkload(const IWorkloadFactory& factory) const override;
+
+    /// Creates a dynamically-allocated copy of this layer.
+    /// @param [in] graph The graph into which this layer is being cloned.
+    Pooling3dLayer* Clone(Graph& graph) const override;
+
+    /// Check if the input tensor shape(s)
+    /// will lead to a valid configuration of @ref Pooling3dLayer.
+    /// @param [in] shapeInferenceMethod Indicates if output shape shall be overwritten or just validated.
+    void ValidateTensorShapesFromInputs() override;
+
+    /// By default returns inputShapes if the number of inputs are equal to number of outputs,
+    /// otherwise infers the output shapes from given input shapes and layer properties.
+    /// @param [in] inputShapes The input shapes layer has.
+    /// @return A vector to the inferred output shape.
+    std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override;
+
+    ARMNN_NO_DEPRECATE_WARN_BEGIN
+    void Accept(ILayerVisitor& visitor) const override;
+    ARMNN_NO_DEPRECATE_WARN_END
+
+
+protected:
+    /// Constructor to create a Pooling3dLayer.
+    /// @param [in] param Pooling3dDescriptor to configure the pooling3d operation.
+    /// @param [in] name Optional name for the layer.
+    Pooling3dLayer(const Pooling3dDescriptor& param, const char* name);
+
+    /// Default destructor
+    ~Pooling3dLayer() = default;
+};
+
+} // namespace
diff --git a/src/armnn/test/InferOutputTests.cpp b/src/armnn/test/InferOutputTests.cpp
index 5365b83..f8d8e89 100644
--- a/src/armnn/test/InferOutputTests.cpp
+++ b/src/armnn/test/InferOutputTests.cpp
@@ -47,6 +47,9 @@
 // TransposeConvolution2D
 ARMNN_SIMPLE_TEST_CASE(TransposeConvolution2dInferOutputShape, TransposeConvolution2dInferOutputShapeTest)
 
+// Pooling3D
+ARMNN_SIMPLE_TEST_CASE(Pooling3dInferOutputShape, Pooling3dInferOutputShapeTest)
+
 // QLstm
 ARMNN_SIMPLE_TEST_CASE(QLstmInferOutputShape, QLstmInferOutputShapeTest)
 
diff --git a/src/armnn/test/InferOutputTests.hpp b/src/armnn/test/InferOutputTests.hpp
index e2c8545..6435d87 100644
--- a/src/armnn/test/InferOutputTests.hpp
+++ b/src/armnn/test/InferOutputTests.hpp
@@ -565,6 +565,40 @@
     CHECK(expectedOutputShape == depthwiseConvolution2dLayer->InferOutputShapes(shapes).at(0));
 }
 
+void Pooling3dInferOutputShapeTest()
+{
+    armnn::Graph graph;
+
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolDepth = 2;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_PoolWidth = 2;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 1;
+    descriptor.m_StrideX = 2;
+    descriptor.m_StrideY = 2;
+    descriptor.m_StrideZ = 2;
+    descriptor.m_DataLayout = armnn::DataLayout::NDHWC;
+
+    armnn::Pooling3dLayer* const pooling3dLayer =
+            graph.AddLayer<armnn::Pooling3dLayer>(descriptor, "pooling3d");
+
+    std::vector<armnn::TensorShape> shapes;
+    const std::vector<unsigned int> inputSize = {1, 4, 4, 4, 1};
+    armnn::TensorShape inputShape(5, inputSize.data());
+    shapes.push_back(inputShape);
+
+    const std::vector<unsigned int> expectedOutputSizes = {1, 3, 3, 3, 1};
+    armnn::TensorShape expectedOutputShape(5, expectedOutputSizes.data());
+
+    CHECK(expectedOutputShape == pooling3dLayer->InferOutputShapes(shapes).at(0));
+}
+
 // QLstm
 void QLstmInferOutputShapeImpl(const armnn::QLstmDescriptor descriptor,
                                const std::vector<armnn::TensorShape>& inputShapes,
diff --git a/src/armnnUtils/TensorUtils.cpp b/src/armnnUtils/TensorUtils.cpp
index 505c9f8..5b5b2bd 100644
--- a/src/armnnUtils/TensorUtils.cpp
+++ b/src/armnnUtils/TensorUtils.cpp
@@ -55,6 +55,27 @@
     }
 }
 
+TensorInfo GetTensorInfo(unsigned int numberOfBatches,
+                         unsigned int numberOfChannels,
+                         unsigned int depth,
+                         unsigned int height,
+                         unsigned int width,
+                         const DataLayout dataLayout,
+                         const DataType dataType)
+{
+    switch (dataLayout)
+    {
+        case DataLayout::NDHWC:
+            return TensorInfo({numberOfBatches, depth, height, width, numberOfChannels}, dataType);
+        case DataLayout::NCDHW:
+            return TensorInfo({numberOfBatches, numberOfChannels, depth, height, width}, dataType);
+        default:
+            throw InvalidArgumentException("Unknown data layout ["
+                                                  + std::to_string(static_cast<int>(dataLayout)) +
+                                                  "]", CHECK_LOCATION());
+    }
+}
+
 std::pair<float, float> FindMinMax(ITensorHandle* tensorHandle)
 {
     auto tensor_data = static_cast<const float *>(tensorHandle->Map(true));
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index ca1acc3..220590e 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -433,6 +433,14 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsPooling3dSupported(const TensorInfo&, // input
+                                            const TensorInfo&, // output
+                                            const Pooling3dDescriptor&, // descriptor
+                                            Optional<std::string&> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
 bool LayerSupportBase::IsPreCompiledSupported(const TensorInfo&, // input
                                               const PreCompiledDescriptor&, // descriptor
                                               Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index fc2906f..ef947aa 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -267,6 +267,11 @@
                               const Pooling2dDescriptor& descriptor,
                               Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsPooling3dSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const Pooling3dDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsPreCompiledSupported(const TensorInfo& input,
                                 const PreCompiledDescriptor& descriptor,
                                 Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 2716c82..eb2ff4e 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1531,6 +1531,34 @@
     ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
 }
 
+void Pooling3dQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"Pooling3dQueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 5, "input");
+    ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 5, "output");
+
+    std::vector<DataType> supportedTypes =
+    {
+        DataType::BFloat16,
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QAsymmS8,
+        DataType::QAsymmU8,
+        DataType::QSymmS16
+    };
+
+    ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
+    ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
+
 void ResizeBilinearQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"ResizeBilinearQueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 4e56aaf..15c79e3 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -193,6 +193,13 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+// Pooling 3D layer workload data.
+struct Pooling3dQueueDescriptor : QueueDescriptorWithParameters<Pooling3dDescriptor>
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
+
 // Convolution 2D layer workload data.
 struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2dDescriptor>
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 55ce355..ef2a348 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -831,6 +831,17 @@
                                                              reason);
             break;
         }
+        case LayerType::Pooling3d:
+        {
+            auto cLayer = PolymorphicDowncast<const Pooling3dLayer*>(&layer);
+            const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = layerSupportObject.IsPooling3dSupported(OverrideDataType(input, dataType),
+                                                             OverrideDataType(output, dataType),
+                                                             cLayer->GetParameters(),
+                                                             reason);
+            break;
+        }
         case LayerType::PreCompiled:
         {
             auto cLayer = PolymorphicDowncast<const PreCompiledLayer*>(&layer);
@@ -1781,6 +1792,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreatePooling3d(const Pooling3dQueueDescriptor& /*descriptor*/,
+                                                             const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreatePreCompiled(const PreCompiledQueueDescriptor& /*descriptor*/,
                                                                const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index df4bcd6..d624d1b 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -207,6 +207,9 @@
     virtual std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
                                                        const WorkloadInfo&           info) const;
 
+    virtual std::unique_ptr<IWorkload> CreatePooling3d(const Pooling3dQueueDescriptor& descriptor,
+                                                       const WorkloadInfo&           info) const;
+
     virtual std::unique_ptr<IWorkload> CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
                                                          const WorkloadInfo& info) const;
 
diff --git a/src/backends/backendsCommon/WorkloadFactoryBase.hpp b/src/backends/backendsCommon/WorkloadFactoryBase.hpp
index ef507a6..4a67df5 100644
--- a/src/backends/backendsCommon/WorkloadFactoryBase.hpp
+++ b/src/backends/backendsCommon/WorkloadFactoryBase.hpp
@@ -200,6 +200,10 @@
                                                const WorkloadInfo& /*info*/) const override
     { return nullptr; }
 
+    std::unique_ptr<IWorkload> CreatePooling3d(const Pooling3dQueueDescriptor& /*descriptor*/,
+                                               const WorkloadInfo& /*info*/) const override
+    { return nullptr; }
+
     std::unique_ptr<IWorkload> CreatePreCompiled(const PreCompiledQueueDescriptor& /*descriptor*/,
                                                  const WorkloadInfo& /*info*/) const override
     { return nullptr; }
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 56c9d65..206faf5 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -85,6 +85,7 @@
     test/layerTests/NormalizationTestImpl.cpp \
     test/layerTests/PadTestImpl.cpp \
     test/layerTests/Pooling2dTestImpl.cpp \
+    test/layerTests/Pooling3dTestImpl.cpp \
     test/layerTests/RankTestImpl.cpp \
     test/layerTests/ReductionTestImpl.cpp \
     test/layerTests/ReduceProdTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index cd62242..958f484 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -142,6 +142,8 @@
     layerTests/PermuteTestImpl.hpp
     layerTests/Pooling2dTestImpl.cpp
     layerTests/Pooling2dTestImpl.hpp
+    layerTests/Pooling3dTestImpl.cpp
+    layerTests/Pooling3dTestImpl.hpp
     layerTests/PreluTestImpl.hpp
     layerTests/QuantizeTestImpl.cpp
     layerTests/QuantizeTestImpl.hpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index 76312ce..aa55557 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -702,6 +702,8 @@
 
 DECLARE_LAYER_POLICY_2_PARAM(Pooling2d)
 
+DECLARE_LAYER_POLICY_2_PARAM(Pooling3d)
+
 DECLARE_LAYER_POLICY_2_PARAM(PreCompiled)
 
 DECLARE_LAYER_POLICY_1_PARAM(Prelu)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index b51ff33..6bd2943 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -50,6 +50,7 @@
 #include <backendsCommon/test/layerTests/PadTestImpl.hpp>
 #include <backendsCommon/test/layerTests/PermuteTestImpl.hpp>
 #include <backendsCommon/test/layerTests/Pooling2dTestImpl.hpp>
+#include <backendsCommon/test/layerTests/Pooling3dTestImpl.hpp>
 #include <backendsCommon/test/layerTests/PreluTestImpl.hpp>
 #include <backendsCommon/test/layerTests/QuantizeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/RankTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/WorkloadDataValidation.cpp b/src/backends/backendsCommon/test/WorkloadDataValidation.cpp
index 2034a65..a19d12f 100644
--- a/src/backends/backendsCommon/test/WorkloadDataValidation.cpp
+++ b/src/backends/backendsCommon/test/WorkloadDataValidation.cpp
@@ -74,6 +74,27 @@
     CHECK_THROWS_AS(RefPooling2dWorkload(invalidData, invalidInfo), armnn::InvalidArgumentException);
 }
 
+TEST_CASE("RefPooling3dFloat32Workload_Validate_WrongDimTensor")
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {2, 3, 4, 5}; // <- Invalid - input tensor has to be 5D.
+    unsigned int outputShape[] = {2, 3, 4, 5, 6};
+
+    outputTensorInfo = armnn::TensorInfo(5, outputShape, armnn::DataType::Float32);
+    inputTensorInfo  = armnn::TensorInfo(4, inputShape, armnn::DataType::Float32);
+
+    Pooling3dQueueDescriptor invalidData;
+    WorkloadInfo           invalidInfo;
+
+    AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr);
+    AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr);
+
+    // Invalid argument exception is expected, input tensor has to be 5D.
+    CHECK_THROWS_AS(RefPooling3dWorkload(invalidData, invalidInfo), armnn::InvalidArgumentException);
+}
+
 TEST_CASE("SoftmaxQueueDescriptor_Validate_WrongInputHeight")
 {
     unsigned int inputHeight = 1;
diff --git a/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.cpp
new file mode 100644
index 0000000..96a56fd
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.cpp
@@ -0,0 +1,1405 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+
+#include "Pooling3dTestImpl.hpp"
+
+#include <QuantizeHelper.hpp>
+#include <ResolveType.hpp>
+
+#include <armnnUtils/TensorUtils.hpp>
+#include <armnnUtils/DataLayoutIndexed.hpp>
+#include <armnnUtils/Permute.hpp>
+
+#include <armnn/utility/IgnoreUnused.hpp>
+#include <armnn/utility/NumericCast.hpp>
+
+#include <armnn/BackendHelper.hpp>
+#include <backendsCommon/WorkloadInfo.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+using namespace armnnUtils;
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> SimplePooling3dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    armnn::Pooling3dDescriptor descriptor,
+    float qScale,
+    int32_t qOffset,
+    const std::vector<T>& input,
+    const std::vector<T>& outputExpected,
+    const armnn::TensorShape& inputShape,
+    const armnn::TensorShape& outputShape)
+{
+    IgnoreUnused(memoryManager);
+    const armnn::DataLayout dataLayout = descriptor.m_DataLayout;
+    const armnnUtils::DataLayoutIndexed dimensionIndices = dataLayout;
+    auto heightIndex = dimensionIndices.GetHeightIndex();
+    auto widthIndex = dimensionIndices.GetWidthIndex();
+    auto depthIndex = dimensionIndices.GetDepthIndex();
+    auto channelsIndex = dimensionIndices.GetChannelsIndex();
+
+    unsigned int inputDepth      = armnn::numeric_cast<unsigned int>(inputShape[depthIndex]);
+    unsigned int inputHeight     = armnn::numeric_cast<unsigned int>(inputShape[heightIndex]);
+    unsigned int inputWidth      = armnn::numeric_cast<unsigned int>(inputShape[widthIndex]);
+    unsigned int inputChannels   = armnn::numeric_cast<unsigned int>(inputShape[channelsIndex]);
+    unsigned int inputBatchSize  = armnn::numeric_cast<unsigned int>(inputShape[0]);
+
+    unsigned int outputDepth     = armnn::numeric_cast<unsigned int>(outputShape[depthIndex]);
+    unsigned int outputHeight    = armnn::numeric_cast<unsigned int>(outputShape[heightIndex]);
+    unsigned int outputWidth     = armnn::numeric_cast<unsigned int>(outputShape[widthIndex]);
+    unsigned int outputChannels  = armnn::numeric_cast<unsigned int>(outputShape[channelsIndex]);
+    unsigned int outputBatchSize = armnn::numeric_cast<unsigned int>(outputShape[0]);
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(
+        inputBatchSize, inputChannels, inputDepth, inputHeight, inputWidth, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(
+        outputBatchSize, outputChannels, outputDepth, outputHeight, outputWidth, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    LayerTestResult<T, 5> result(outputTensorInfo);
+    std::vector<T> actualOutput(outputTensorInfo.GetNumElements());
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Pooling3dQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = descriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    // Don't execute if Pooling is not supported, as an exception will be raised.
+    armnn::BackendId backend = workloadFactory.GetBackendId();
+    std::string reasonIfUnsupported;
+    armnn::LayerSupportHandle handle = armnn::GetILayerSupportByBackendId(backend);
+    result.m_Supported = handle.IsPooling3dSupported(inputTensorInfo,
+                                                     outputTensorInfo,
+                                                     queueDescriptor.m_Parameters,
+                                                     reasonIfUnsupported);
+    if (!result.m_Supported)
+    {
+        return result;
+    }
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePooling3d(queueDescriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get());
+
+    result.m_ActualData = actualOutput;
+    result.m_ExpectedData = outputExpected;
+
+    return result;
+}
+
+//
+// Tests max pooling with the following parameters:
+//
+//   Pooling size: 2x2x2
+//   Stride:       (1,1,1)
+//   input size:   3x3x3
+//   channels:     2
+//   batch size:   2
+//
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = 2;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = 1;
+    descriptor.m_StrideY = 1;
+    descriptor.m_StrideZ = 1;
+    descriptor.m_PadLeft = descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = descriptor.m_PadBottom = 0;
+    descriptor.m_PadFront = descriptor.m_PadBack = 0;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    unsigned int inputWidth = 3;
+    unsigned int inputHeight = 3;
+    unsigned int inputDepth = 3;
+    unsigned int outputWidth =
+        (inputWidth + descriptor.m_PadLeft + descriptor.m_PadRight + descriptor.m_StrideX - descriptor.m_PoolWidth) /
+        descriptor.m_StrideX;
+    unsigned int outputHeight =
+        (inputHeight + descriptor.m_PadTop + descriptor.m_PadBottom + descriptor.m_StrideY - descriptor.m_PoolHeight) /
+        descriptor.m_StrideY;
+    unsigned int outputDepth =
+        (inputDepth + descriptor.m_PadFront + descriptor.m_PadBack + descriptor.m_StrideZ - descriptor.m_PoolDepth) /
+        descriptor.m_StrideZ;
+    unsigned int channels = 2;
+    unsigned int batchSize = 2;
+
+    armnn::TensorInfo inputTensorInfo({ batchSize, channels, inputDepth, inputHeight, inputWidth }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ batchSize, channels, outputDepth, outputHeight, outputWidth }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<float> singleChannelData({
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+    });
+
+    // Constructs input data.
+    std::vector<float> inputData;
+    auto negator = [](float f) { return -f; };
+
+    // First image (two channels where the second channel is the negative of the first one).
+    inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
+    std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
+
+    // Second image (same as first image).
+    inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
+    std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
+
+    auto input = QuantizedVector<T>(inputData, qScale, qOffset);
+
+    // Expected output values; these were calculated manually.
+    std::vector<T> outputExpected = QuantizedVector<T>(
+            {
+                1.0f, 1.0f,
+                1.0f, 1.0f,
+
+                1.0f, 1.0f,
+                1.0f, 1.0f,
+
+                -1.0f, -1.0f,
+                -1.0f, -1.0f,
+
+                -1.0f, -1.0f,
+                -1.0f, -1.0f,
+
+
+                1.0f, 1.0f,
+                1.0f, 1.0f,
+
+                1.0f, 1.0f,
+                1.0f, 1.0f,
+
+                -1.0f, -1.0f,
+                -1.0f, -1.0f,
+
+                -1.0f, -1.0f,
+                -1.0f, -1.0f,
+            },
+            qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> SimpleMaxPooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout = armnn::DataLayout::NCDHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 1, 4, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 1, 2, 2, 2, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>({
+             1.0f,  2.0f,  5.0f,  6.0f,
+             3.0f,  4.0f,  7.0f,  8.0f,
+             9.0f, 10.0f, 13.0f, 14.0f,
+            11.0f, 12.0f, 15.0f, 16.0f,
+
+            17.0f, 18.0f, 21.0f, 22.0f,
+            19.0f, 20.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 29.0f, 30.0f,
+            27.0f, 28.0f, 31.0f, 32.0f,
+
+            33.0f, 34.0f, 37.0f, 38.0f,
+            35.0f, 36.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 45.0f, 46.0f,
+            43.0f, 44.0f, 47.0f, 48.0f,
+
+            49.0f, 50.0f, 53.0f, 54.0f,
+            51.0f, 52.0f, 55.0f, 56.0f,
+            57.0f, 58.0f, 61.0f, 62.0f,
+            59.0f, 60.0f, 63.0f, 64.0f,
+        },
+        qScale, qOffset));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>({
+            20.0f, 24.0f,
+            28.0f, 32.0f,
+
+            52.0f, 56.0f,
+            60.0f, 64.0f,
+        },
+        qScale, qOffset));
+
+    const armnn::PermutationVector NCDHWToNDHWC = { 0, 4, 1, 2, 3 };
+    if (dataLayout == armnn::DataLayout::NDHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCDHWToNDHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCDHWToNDHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        inputData, outputData, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> IgnorePaddingSimpleMaxPooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = QuantizedVector<T>(
+        {
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+        },
+        qScale, qOffset);
+
+    auto outputExpected = QuantizedVector<T>(
+        {
+            -1.0f,  3.0f,  4.0f,
+             1.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -4.0f,
+
+            -1.0f,  3.0f,  4.0f,
+             1.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -4.0f,
+
+            -1.0f,  3.0f,  4.0f,
+             1.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -4.0f,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> SimpleAveragePooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCDHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 1, 4, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 1, 2, 2, 2, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>({
+             1.0f,  2.0f,  5.0f,  6.0f,
+             3.0f,  4.0f,  7.0f,  8.0f,
+             9.0f, 10.0f, 13.0f, 14.0f,
+            11.0f, 12.0f, 15.0f, 16.0f,
+
+            17.0f, 18.0f, 21.0f, 22.0f,
+            19.0f, 20.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 29.0f, 30.0f,
+            27.0f, 28.0f, 31.0f, 32.0f,
+
+            33.0f, 34.0f, 37.0f, 38.0f,
+            35.0f, 36.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 45.0f, 46.0f,
+            43.0f, 44.0f, 47.0f, 48.0f,
+
+            49.0f, 50.0f, 53.0f, 54.0f,
+            51.0f, 52.0f, 55.0f, 56.0f,
+            57.0f, 58.0f, 61.0f, 62.0f,
+            59.0f, 60.0f, 63.0f, 64.0f,
+        },
+        qScale, qOffset));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>({
+            10.5f, 14.5f,
+            18.5f, 22.5f,
+
+            42.5f, 46.5f,
+            50.5f, 54.5f,
+        },
+        qScale, qOffset));
+
+    const armnn::PermutationVector NCDHWToNDHWC = { 0, 4, 1, 2, 3 };
+    if (dataLayout == armnn::DataLayout::NDHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCDHWToNDHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCDHWToNDHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        inputData, outputData, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> LargeTensorsAveragePooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 100;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 5;
+    descriptor.m_PadLeft = 50;
+    descriptor.m_PadRight = 50;
+    descriptor.m_PadTop = 50;
+    descriptor.m_PadBottom = 50;
+    descriptor.m_PadFront = 50;
+    descriptor.m_PadBack = 50;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 5, 3, 52, 60, 68 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 5, 3, 11, 13, 15 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> input;
+
+    for (unsigned int i = 0 ; i < inputTensorInfo.GetShape().GetNumElements(); ++i)
+    {
+        input.push_back(1);
+    }
+
+    std::vector<T> outputExpected;
+
+    for (unsigned int i = 0 ; i < outputTensorInfo.GetShape().GetNumElements(); ++i)
+    {
+        outputExpected.push_back(1);
+    }
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> IgnorePaddingSimpleAveragePooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = QuantizedVector<T>(
+        {
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+
+            24.0f, 40.0f, 64.0f, 80.0f,
+            24.0f, 40.0f, 64.0f, 80.0f,
+            24.0f, 40.0f, 64.0f, 80.0f,
+            24.0f, 40.0f, 64.0f, 80.0f,
+
+            36.0f, 60.0f, 96.0f, 120.0f,
+            36.0f, 60.0f, 96.0f, 120.0f,
+            36.0f, 60.0f, 96.0f, 120.0f,
+            36.0f, 60.0f, 96.0f, 120.0f,
+
+            48.0f, 80.0f, 128.0f, 160.0f,
+            48.0f, 80.0f, 128.0f, 160.0f,
+            48.0f, 80.0f, 128.0f, 160.0f,
+            48.0f, 80.0f, 128.0f, 160.0f,
+        },
+        qScale, qOffset);
+
+    auto outputExpected = QuantizedVector<T>(
+        {
+            1.5f,  6.5f,  5.0f,
+            3.0f,  13.0f,  10.0f,
+            1.5f,  6.5f,  5.0f,
+
+            7.5f,  32.5f,  25.0f,
+            15.0f,  65.0f,  50.0f,
+            7.5f,  32.5f,  25.0f,
+
+            6.0f,  26.0f,  20.0f,
+            12.0f,  52.0f,  40.0f,
+            6.0f,  26.0f,  20.0f,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> SimpleL2Pooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCDHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 1, 4, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 1, 2, 2, 2, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>({
+             1.0f,  2.0f,  5.0f,  6.0f,
+             3.0f,  4.0f,  7.0f,  8.0f,
+             9.0f, 10.0f, 13.0f, 14.0f,
+            11.0f, 12.0f, 15.0f, 16.0f,
+
+            17.0f, 18.0f, 21.0f, 22.0f,
+            19.0f, 20.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 29.0f, 30.0f,
+            27.0f, 28.0f, 31.0f, 32.0f,
+
+            33.0f, 34.0f, 37.0f, 38.0f,
+            35.0f, 36.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 45.0f, 46.0f,
+            43.0f, 44.0f, 47.0f, 48.0f,
+
+            49.0f, 50.0f, 53.0f, 54.0f,
+            51.0f, 52.0f, 55.0f, 56.0f,
+            57.0f, 58.0f, 61.0f, 62.0f,
+            59.0f, 60.0f, 63.0f, 64.0f,
+        },
+        qScale, qOffset));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>({
+            13.2476412995f, 16.5981926727f,
+            20.1866292382f, 23.9060661758f,
+
+            43.2608367926f, 47.1963981677f,
+            51.1419592898f, 55.0953718564f,
+        },
+        qScale, qOffset));
+
+    const armnn::PermutationVector NCDHWToNDHWC = { 0, 4, 1, 2, 3 };
+    if (dataLayout == armnn::DataLayout::NDHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCDHWToNDHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCDHWToNDHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        inputData, outputData, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> IgnorePaddingSimpleL2Pooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = descriptor.m_PoolDepth = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = descriptor.m_StrideZ = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = QuantizedVector<T>(
+        {
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+
+            2.0f, 3.0f, 4.0f, 5.0f,
+            2.0f, 3.0f, 4.0f, 5.0f,
+            2.0f, 3.0f, 4.0f, 5.0f,
+            2.0f, 3.0f, 4.0f, 5.0f,
+
+            3.0f, 4.0f, 5.0f, 6.0f,
+            3.0f, 4.0f, 5.0f, 6.0f,
+            3.0f, 4.0f, 5.0f, 6.0f,
+            3.0f, 4.0f, 5.0f, 6.0f,
+
+            4.0f, 5.0f, 6.0f, 7.0f,
+            4.0f, 5.0f, 6.0f, 7.0f,
+            4.0f, 5.0f, 6.0f, 7.0f,
+            4.0f, 5.0f, 6.0f, 7.0f,
+        },
+        qScale, qOffset);
+
+    float v111 = float(sqrt(pow(1,2)/8.0f));
+    float v112 = float(sqrt((pow(2,2)+pow(3,2))/8.0f));
+    float v113 = float(sqrt(pow(4,2)/8));
+
+    float v121 = float(sqrt((2*pow(1,2))/8.0f));
+    float v122 = float(sqrt((2*pow(2,2)+2*pow(3,2))/8.0f));
+    float v123 = float(sqrt((2*pow(4,2))/8.0f));
+
+    float v131 = v111;
+    float v132 = v112;
+    float v133 = v113;
+
+    float v211 = float(sqrt((pow(2,2)+pow(3,2))/8.0f));
+    float v212 = float(sqrt((pow(3,2)+2*pow(4,2)+pow(5,2))/8.0f));
+    float v213 = float(sqrt((pow(5,2)+pow(6,2))/8.0f));
+
+    float v221 = float(sqrt((2*pow(2,2)+2*pow(3,2))/8.0f));
+    float v222 = float(sqrt((2*pow(3,2)+4*pow(4,2)+2*pow(5,2))/8.0f));
+    float v223 = float(sqrt((2*pow(5,2)+2*pow(6,2))/8.0f));
+
+    float v231 = v211;
+    float v232 = v212;
+    float v233 = v213;
+
+    float v311 = float(sqrt(pow(4,2)/8.0f));
+    float v312 = float(sqrt((pow(5,2)+pow(6,2))/8.0f));
+    float v313 = float(sqrt(pow(7,2)/8));
+
+    float v321 = float(sqrt((2*pow(4,2))/8.0f));
+    float v322 = float(sqrt((2*pow(5,2)+2*pow(6,2))/8.0f));
+    float v323 = float(sqrt((2*pow(7,2))/8.0f));
+
+    float v331 = v311;
+    float v332 = v312;
+    float v333 = v313;
+
+    auto outputExpected = QuantizedVector<T>(
+        {
+            v111,  v112,  v113,
+            v121,  v122,  v123,
+            v131,  v132,  v133,
+
+            v211,  v212,  v213,
+            v221,  v222,  v223,
+            v231,  v232,  v233,
+
+            v311,  v312,  v313,
+            v321,  v322,  v323,
+            v331,  v332,  v333,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> AsymmetricNonSquareMaxPooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 1, 3, 1 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2, 1 }, ArmnnType);
+
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = 1;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_PoolDepth = 3;
+    descriptor.m_StrideX = 0;
+    descriptor.m_StrideY = 2;
+    descriptor.m_StrideZ = 1;
+    descriptor.m_PadLeft = 0;
+    descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = 2;
+    descriptor.m_PadBottom = 0;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 2;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    // Construct input data.
+    auto input = QuantizedVector<T>(
+        {
+            1.0f, 3.0f, 4.0f,
+        },
+        qScale, qOffset);
+
+    // These were calculated manually.
+    auto outputExpected = QuantizedVector<T>(
+        {
+            0.0f, 3.0f, 0.0f, 3.0f,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> AsymmetricNonSquareAveragePooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 1, 3, 1 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2, 1 }, ArmnnType);
+
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = 1;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_PoolDepth = 3;
+    descriptor.m_StrideX = 0;
+    descriptor.m_StrideY = 2;
+    descriptor.m_StrideZ = 1;
+    descriptor.m_PadLeft = 0;
+    descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = 2;
+    descriptor.m_PadBottom = 0;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 2;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    // Construct input data.
+    auto input = QuantizedVector<T>(
+        {
+            1.0f, 3.0f, 4.0f,
+        },
+        qScale, qOffset);
+
+    // These were calculated manually.
+    auto outputExpected = QuantizedVector<T>(
+        {
+            0.0f, 2.0f, 0.0f, 2.0f,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> AsymmetricNonSquareL2Pooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 1, 3, 1 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2, 1 }, ArmnnType);
+
+    armnn::Pooling3dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = 1;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_PoolDepth = 3;
+    descriptor.m_StrideX = 0;
+    descriptor.m_StrideY = 2;
+    descriptor.m_StrideZ = 1;
+    descriptor.m_PadLeft = 0;
+    descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = 2;
+    descriptor.m_PadBottom = 0;
+    descriptor.m_PadFront = 1;
+    descriptor.m_PadBack = 2;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    // Construct input data.
+    auto input = QuantizedVector<T>(
+        {
+            1.0f, 3.0f, 4.0f,
+        },
+        qScale, qOffset);
+
+    // These were calculated manually.
+    auto outputExpected = QuantizedVector<T>(
+        {
+            0.0f, 2.2360679775f, 0.0f, 2.2360679775f,
+        },
+        qScale, qOffset);
+
+    return SimplePooling3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, tensorHandleFactory, descriptor, qScale, qOffset,
+        input, outputExpected, inputTensorInfo.GetShape(), outputTensorInfo.GetShape());
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> ComparePooling3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm poolingType,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    IgnoreUnused(memoryManager);
+    const unsigned int inputWidth = 16;
+    const unsigned int inputHeight = 32;
+    const unsigned int inputDepth = 48;
+    const unsigned int channelCount = 2;
+    const unsigned int batchSize = 5;
+
+    const unsigned int poolSize = 3;
+    const unsigned int strideX = 2;
+    const unsigned int strideY = 4;
+    const unsigned int strideZ = 6;
+    const unsigned int padX = 0;
+    const unsigned int padY = 0;
+    const unsigned int padZ = 0;
+
+    const unsigned int outputWidth = (inputWidth + 2 * padX + strideX - poolSize) / strideX;
+    const unsigned int outputHeight = (inputHeight + 2 * padY + strideY - poolSize) / strideY;
+    const unsigned int outputDepth = (inputDepth + 2 * padZ + strideZ - poolSize) / strideZ;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { batchSize, channelCount, inputHeight, inputWidth, inputDepth };
+    unsigned int outputShape[] = { batchSize, channelCount, outputHeight, outputWidth, outputDepth };
+
+    inputTensorInfo = armnn::TensorInfo(5, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(5, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> input = MakeRandomTensor<T>(inputTensorInfo, 81715);
+    std::vector<T> actualOutput(outputTensorInfo.GetNumElements());
+    std::vector<T> expectedOutput(outputTensorInfo.GetNumElements());
+
+    LayerTestResult<T, 5> comparisonResult(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Pooling3dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_PoolType = poolingType;
+    data.m_Parameters.m_PoolWidth = poolSize;
+    data.m_Parameters.m_PoolHeight = poolSize;
+    data.m_Parameters.m_PoolDepth = poolSize;
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_StrideZ = strideZ;
+    data.m_Parameters.m_PadLeft = padX;
+    data.m_Parameters.m_PadRight = padX;
+    data.m_Parameters.m_PadTop = padY;
+    data.m_Parameters.m_PadBottom = padY;
+    data.m_Parameters.m_PadFront = padZ;
+    data.m_Parameters.m_PadBack = padZ;
+    data.m_Parameters.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refTensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refTensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+
+    // Don't execute if Pooling is not supported, as an exception will be raised.
+    armnn::BackendId backend = workloadFactory.GetBackendId();
+    std::string reasonIfUnsupported;
+    armnn::LayerSupportHandle handle = armnn::GetILayerSupportByBackendId(backend);
+    comparisonResult.m_Supported = handle.IsPooling3dSupported(inputTensorInfo,
+                                                               outputTensorInfo,
+                                                               data.m_Parameters,
+                                                               reasonIfUnsupported);
+    if (!comparisonResult.m_Supported)
+    {
+        return comparisonResult;
+    }
+
+    armnn::Pooling3dQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePooling3d(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreatePooling3d(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+    CopyDataToITensorHandle(inputHandleRef.get(), input.data());
+
+    workload->Execute();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get());
+    CopyDataFromITensorHandle(expectedOutput.data(), outputHandleRef.get());
+
+    comparisonResult.m_ActualData = actualOutput;
+    comparisonResult.m_ExpectedData = expectedOutput;
+
+    return comparisonResult;
+}
+
+
+} // anonymous namespace
+
+LayerTestResult<float, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SimpleMaxPooling3dSize2x2x2Stride1x1x1TestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SimpleMaxPooling3dSize2x2x2Stride1x1x1TestCommon<armnn::DataType::QAsymmU8>(
+        workloadFactory, memoryManager, tensorHandleFactory, 0.1f, 128);
+}
+
+LayerTestResult<int16_t, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SimpleMaxPooling3dSize2x2x2Stride1x1x1TestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> SimpleMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<uint8_t, 5> SimpleMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<int16_t, 5> SimpleMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<float, 5> IgnorePaddingSimpleMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleMaxPooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleMaxPooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, 1.0f, -5);
+}
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleMaxPooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> SimpleAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<uint8_t, 5> SimpleAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<int16_t, 5> SimpleAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<float, 5> SimpleL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<uint8_t, 5> SimpleL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<int16_t, 5> SimpleL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory, dataLayout);
+}
+
+LayerTestResult<float, 5> LargeTensorsAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return LargeTensorsAveragePooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> LargeTensorsAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return LargeTensorsAveragePooling3dTestCommon<armnn::DataType::QAsymmU8>(
+        workloadFactory, memoryManager, tensorHandleFactory, 0.5, -1);
+}
+
+LayerTestResult<int16_t, 5> LargeTensorsAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return LargeTensorsAveragePooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> IgnorePaddingSimpleAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleAveragePooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleAveragePooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, 1.0f, -5);
+}
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleAveragePooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> IgnorePaddingSimpleL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleL2Pooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleL2Pooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory, 1.0f, -5);
+}
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return IgnorePaddingSimpleL2Pooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> AsymmetricNonSquareMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareMaxPooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareMaxPooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareMaxPooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> AsymmetricNonSquareAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareAveragePooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareAveragePooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareAveragePooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> AsymmetricNonSquareL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareL2Pooling3dTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareL2Pooling3dTestCommon<armnn::DataType::QAsymmU8>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return AsymmetricNonSquareL2Pooling3dTestCommon<armnn::DataType::QSymmS16>(
+            workloadFactory, memoryManager, tensorHandleFactory);
+}
+
+LayerTestResult<float, 5> ComparePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling3dTestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager,  refWorkloadFactory, tensorHandleFactory, refTensorHandleFactory, poolingType);
+}
+
+LayerTestResult<uint8_t, 5> ComparePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling3dTestCommon<armnn::DataType::QAsymmU8>(
+        workloadFactory, memoryManager,  refWorkloadFactory, tensorHandleFactory, refTensorHandleFactory,
+        poolingType, 0.1f, 128);
+}
+
+LayerTestResult<int16_t, 5> ComparePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling3dTestCommon<armnn::DataType::QSymmS16>(
+        workloadFactory, memoryManager,  refWorkloadFactory, tensorHandleFactory, refTensorHandleFactory, poolingType);
+}
\ No newline at end of file
diff --git a/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.hpp
new file mode 100644
index 0000000..e7cd6b4
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Pooling3dTestImpl.hpp
@@ -0,0 +1,213 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <armnn/Types.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float,   5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> SimpleMaxPooling3dSize2x2x2Stride1x1x1Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> SimpleMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 5> SimpleMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 5> SimpleMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   5> IgnorePaddingSimpleMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> SimpleAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 5> SimpleAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 5> SimpleAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   5> LargeTensorsAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> LargeTensorsAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> LargeTensorsAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> IgnorePaddingSimpleAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> SimpleL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 5> SimpleL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 5> SimpleL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   5> IgnorePaddingSimpleL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> IgnorePaddingSimpleL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> IgnorePaddingSimpleL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> AsymmetricNonSquareMaxPooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareMaxPooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareMaxPooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> AsymmetricNonSquareAveragePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareAveragePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareAveragePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> AsymmetricNonSquareL2Pooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 5> AsymmetricNonSquareL2Pooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int16_t, 5> AsymmetricNonSquareL2Pooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float,   5> ComparePooling3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType);
+
+LayerTestResult<uint8_t, 5> ComparePooling3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType);
+
+LayerTestResult<int16_t, 5> ComparePooling3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::ITensorHandleFactory& tensorHandleFactory,
+    const armnn::ITensorHandleFactory& refTensorHandleFactory,
+    armnn::PoolingAlgorithm  poolingType);
+
+
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index b80aa99..0ac2ddc 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -1721,6 +1721,38 @@
     return supported;
 }
 
+/// Reference backend support check for Pooling3d: input and output must both use one of the listed data types and must match each other.
+bool RefLayerSupport::IsPooling3dSupported(const TensorInfo& input,
+                                           const TensorInfo& output,
+                                           const Pooling3dDescriptor& descriptor,
+                                           Optional<std::string&> reasonIfUnsupported) const
+{
+    IgnoreUnused(descriptor);
+    bool supported = true;
+
+    // Define supported output and inputs types.
+    std::array<DataType,6> supportedTypes =
+    {
+        DataType::BFloat16,
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QAsymmS8,
+        DataType::QAsymmU8,
+        DataType::QSymmS16
+    };
+
+    supported &= CheckSupportRule(TypeAnyOf(input, supportedTypes), reasonIfUnsupported,
+                                  "Reference pooling3d: input is not a supported type.");
+
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
+                                  "Reference pooling3d: output is not a supported type.");
+
+    supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
+                                  "Reference pooling3d: input and output types are mismatched.");
+
+    return supported;
+}
+
 bool RefLayerSupport::IsQLstmSupported(const TensorInfo& input,
                                        const TensorInfo& previousOutputIn,
                                        const TensorInfo& previousCellStateIn,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 53d7907..61d0556 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -250,6 +250,11 @@
                               const Pooling2dDescriptor& descriptor,
                               Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsPooling3dSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const Pooling3dDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsQuantizeSupported(const TensorInfo& input,
                              const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 36dcd21..eff301c 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -543,6 +543,12 @@
     return std::make_unique<RefPooling2dWorkload>(descriptor, info);
 }
 
+// Creates the reference (CpuRef) workload that executes a Pooling3d layer.
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreatePooling3d(const Pooling3dQueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const
+{
+    return std::make_unique<RefPooling3dWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreatePreCompiled(const PreCompiledQueueDescriptor& /*descriptor*/,
                                                                  const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index a85e8dd..21dfed9 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -201,6 +201,9 @@
     std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
                                                const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreatePooling3d(const Pooling3dQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreatePreCompiled(const PreCompiledQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info) const override;
 
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 7049279..0ddb16a 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -44,6 +44,7 @@
         workloads/MirrorPad.cpp \
         workloads/Pad.cpp \
         workloads/Pooling2d.cpp \
+        workloads/Pooling3d.cpp \
         workloads/PreluImpl.cpp \
         workloads/Reduce.cpp \
         workloads/RefActivationWorkload.cpp \
@@ -84,6 +85,7 @@
         workloads/RefPadWorkload.cpp \
         workloads/RefPermuteWorkload.cpp \
         workloads/RefPooling2dWorkload.cpp \
+        workloads/RefPooling3dWorkload.cpp \
         workloads/RefPreluWorkload.cpp \
         workloads/RefQLstmWorkload.cpp \
         workloads/RefQuantizeWorkload.cpp \
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 5993270..13487dd 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -482,7 +482,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(DepthwiseConvolution2dPerAxisQuantTestNhwc, DepthwiseConvolution2dPerAxisQuantTest,
                      DataLayout::NHWC);
 
-// Pooling
+// [ Pooling 2D
 //MaxPooling
 ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling2dSize2x2Stride2x2, SimpleMaxPooling2dSize2x2Stride2x2Test, false)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling2dSize2x2Stride2x2Uint8,
@@ -564,6 +564,68 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquarePooling2d, AsymmetricNonSquarePooling2dTest)
 ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquarePooling2dUint8, AsymmetricNonSquarePooling2dUint8Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquarePooling2dInt16, AsymmetricNonSquarePooling2dInt16Test)
+// Pooling 2D ]
+
+// [ Pooling 3D
+//MaxPooling
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dSize2x2x2Stride1x1x1, SimpleMaxPooling3dSize2x2x2Stride1x1x1Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dSize2x2x2Stride1x1x1Uint8,
+                              SimpleMaxPooling3dSize2x2x2Stride1x1x1Uint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dSize2x2x2Stride1x1x1Int16,
+                              SimpleMaxPooling3dSize2x2x2Stride1x1x1Int16Test)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3d, SimpleMaxPooling3dTest, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dNCDHW, SimpleMaxPooling3dTest, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dUint8, SimpleMaxPooling3dUint8Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dInt16, SimpleMaxPooling3dInt16Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dUint8NCDHW, SimpleMaxPooling3dUint8Test, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleMaxPooling3dInt16NCDHW, SimpleMaxPooling3dInt16Test, DataLayout::NCDHW)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleMaxPooling3d, IgnorePaddingSimpleMaxPooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleMaxPooling3dUint8, IgnorePaddingSimpleMaxPooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleMaxPooling3dInt16, IgnorePaddingSimpleMaxPooling3dInt16Test)
+
+//AveragePooling
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3d, SimpleAveragePooling3dTest, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3dNCDHW, SimpleAveragePooling3dTest, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3dUint8, SimpleAveragePooling3dUint8Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3dInt16, SimpleAveragePooling3dInt16Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3dUint8NCDHW, SimpleAveragePooling3dUint8Test, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleAveragePooling3dInt16NCDHW, SimpleAveragePooling3dInt16Test, DataLayout::NCDHW)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(LargeTensorsAveragePooling3d, LargeTensorsAveragePooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(LargeTensorsAveragePooling3dUint8, LargeTensorsAveragePooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(LargeTensorsAveragePooling3dInt16, LargeTensorsAveragePooling3dInt16Test)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleAveragePooling3d, IgnorePaddingSimpleAveragePooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleAveragePooling3dUint8, IgnorePaddingSimpleAveragePooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleAveragePooling3dInt16, IgnorePaddingSimpleAveragePooling3dInt16Test)
+
+//L2Pooling
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3d, SimpleL2Pooling3dTest, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3dNCDHW, SimpleL2Pooling3dTest, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3dUint8, SimpleL2Pooling3dUint8Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3dInt16, SimpleL2Pooling3dInt16Test, DataLayout::NDHWC)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3dUint8NCDHW, SimpleL2Pooling3dUint8Test, DataLayout::NCDHW)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleL2Pooling3dInt16NCDHW, SimpleL2Pooling3dInt16Test, DataLayout::NCDHW)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleL2Pooling3d, IgnorePaddingSimpleL2Pooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleL2Pooling3dUint8, IgnorePaddingSimpleL2Pooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(IgnorePaddingSimpleL2Pooling3dInt16, IgnorePaddingSimpleL2Pooling3dInt16Test)
+
+//NonSquarePooling
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareMaxPooling3d, AsymmetricNonSquareMaxPooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareMaxPooling3dUint8, AsymmetricNonSquareMaxPooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareMaxPooling3dInt16, AsymmetricNonSquareMaxPooling3dInt16Test)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareAveragePooling3d, AsymmetricNonSquareAveragePooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareAveragePooling3dUint8, AsymmetricNonSquareAveragePooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareAveragePooling3dInt16, AsymmetricNonSquareAveragePooling3dInt16Test)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareL2Pooling3d, AsymmetricNonSquareL2Pooling3dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareL2Pooling3dUint8, AsymmetricNonSquareL2Pooling3dUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AsymmNonSquareL2Pooling3dInt16, AsymmetricNonSquareL2Pooling3dInt16Test)
+// Pooling 3D ]
 
 
 // Linear Activation
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index f212522..60d8255 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -58,6 +58,8 @@
     Pad.hpp
     Pooling2d.cpp
     Pooling2d.hpp
+    Pooling3d.cpp
+    Pooling3d.hpp
     PreluImpl.cpp
     PreluImpl.hpp
     Reduce.cpp
@@ -139,6 +141,8 @@
     RefPermuteWorkload.hpp
     RefPooling2dWorkload.cpp
     RefPooling2dWorkload.hpp
+    RefPooling3dWorkload.cpp
+    RefPooling3dWorkload.hpp
     RefPreluWorkload.cpp
     RefPreluWorkload.hpp
     RefQuantizeWorkload.cpp
diff --git a/src/backends/reference/workloads/Pooling3d.cpp b/src/backends/reference/workloads/Pooling3d.cpp
new file mode 100644
index 0000000..3cae2a9
--- /dev/null
+++ b/src/backends/reference/workloads/Pooling3d.cpp
@@ -0,0 +1,328 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Pooling3d.hpp"
+
+#include <armnn/Exceptions.hpp>
+#include <armnn/Types.hpp>
+
+#include <armnnUtils/DataLayoutIndexed.hpp>
+#include <armnn/utility/NumericCast.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <limits>
+
+namespace
+{
+    using PoolingAlgorithm = armnn::PoolingAlgorithm;
+
+    // Identity value used to seed the pooling accumulator: the lowest
+    // representable float for Max, zero for Average and L2.
+    float DefaultInitializer(PoolingAlgorithm algorithm)
+    {
+        if (algorithm == PoolingAlgorithm::Max)
+        {
+            return std::numeric_limits<float>::lowest();
+        }
+        if (algorithm == PoolingAlgorithm::Average || algorithm == PoolingAlgorithm::L2)
+        {
+            return 0.0f;
+        }
+        throw armnn::InvalidArgumentException("Unsupported pooling algorithm");
+    }
+
+    using Accumulator = std::function<void(float & accu, float value)>;
+
+    // Per-element combining step for a pooling algorithm: running maximum,
+    // running sum, or running sum of squares.
+    Accumulator GetAccumulator(PoolingAlgorithm algorithm)
+    {
+        if (algorithm == PoolingAlgorithm::Max)
+        {
+            return [](float & accu, float value) { accu = std::max(accu, value); };
+        }
+        if (algorithm == PoolingAlgorithm::Average)
+        {
+            return [](float & accu, float value) { accu += value; };
+        }
+        if (algorithm == PoolingAlgorithm::L2)
+        {
+            return [](float & accu, float value) { accu += value * value; };
+        }
+        throw armnn::InvalidArgumentException("Unsupported pooling algorithm");
+    }
+
+    using Executor = std::function<void(float & accumulated, float kernelSize)>;
+
+    // Finalisation step applied once per output element: a no-op for Max,
+    // the mean for Average, and the root-mean-square for L2.
+    Executor GetExecutor(PoolingAlgorithm algorithm)
+    {
+        if (algorithm == PoolingAlgorithm::Max)
+        {
+            return [](float & /*accumulated*/, float /*kernelSize*/) {};
+        }
+        if (algorithm == PoolingAlgorithm::Average)
+        {
+            return [](float & accumulated, float kernelSize) { accumulated /= kernelSize; };
+        }
+        if (algorithm == PoolingAlgorithm::L2)
+        {
+            return [](float & accumulated, float kernelSize) { accumulated = sqrtf(accumulated / kernelSize); };
+        }
+        throw armnn::InvalidArgumentException("Unsupported pooling algorithm");
+    }
+
+    // True when the pooling window [start, end) lies entirely outside the
+    // valid input range, i.e. the kernel covers padding only.
+    bool OnPaddingOnly(int start, int end, int maxRange)
+    {
+        return end <= 0 || start > maxRange;
+    }
+
+
+    // Clamps [start, end) into [0, maxRange], modifying both bounds in place.
+    // Returns true when clamping was required (the window overlapped padding).
+    bool ClampRange(int & start, int & end, int maxRange)
+    {
+        const bool needsClamping = (start < 0) || (end > maxRange);
+        if (needsClamping)
+        {
+            start = std::min(std::max(start, 0), maxRange);
+            end   = std::min(std::max(end, 0), maxRange);
+        }
+        return needsClamping;
+    }
+
+    // Computes the flat buffer index of element (n, c, z, y, x) for either
+    // the NDHWC or the NCDHW tensor layout.
+    int CalculateIndex(int channels, int depth, int height, int width,
+                       int n, int c, int z, int y, int x,
+                       armnnUtils::DataLayoutIndexed dataLayout)
+    {
+        switch (dataLayout.GetDataLayout())
+        {
+            case armnn::DataLayout::NDHWC:
+            {
+                // Channels vary fastest.
+                return ((((n * depth + z) * height + y) * width + x) * channels) + c;
+            }
+            case armnn::DataLayout::NCDHW:
+            {
+                // Width varies fastest.
+                return ((((n * channels + c) * depth + z) * height + y) * width) + x;
+            }
+            default:
+            {
+                throw armnn::InvalidArgumentException("Unsupported data layout.");
+            }
+        }
+    }
+}
+
+using namespace armnnUtils;
+
+namespace armnn
+{
+/// Computes the Pooling3d operation (Max / Average / L2) over a 5d tensor.
+///
+/// @param rInputDecoder  Decoder yielding the input tensor values as float.
+/// @param rOutputEncoder Encoder receiving the pooled output values.
+/// @param inputInfo      Shape/type information of the input tensor.
+/// @param outputInfo     Shape/type information of the output tensor.
+/// @param params         Descriptor: pool type, kernel size, strides, padding, layout.
+/// @throws InvalidArgumentException for unsupported padding methods or layouts.
+void Pooling3d(Decoder<float>& rInputDecoder,
+               Encoder<float>& rOutputEncoder,
+               const TensorInfo& inputInfo,
+               const TensorInfo& outputInfo,
+               const Pooling3dDescriptor& params)
+{
+    const DataLayoutIndexed dataLayout(params.m_DataLayout);
+
+    auto channelsIndex = dataLayout.GetChannelsIndex();
+
+    auto depthIndex  = dataLayout.GetDepthIndex();
+    auto heightIndex = dataLayout.GetHeightIndex();
+    auto widthIndex  = dataLayout.GetWidthIndex();
+
+    const int batchSize    = armnn::numeric_cast<int>(outputInfo.GetShape()[0]);
+    const int channels     = armnn::numeric_cast<int>(outputInfo.GetShape()[channelsIndex]);
+
+    const int depthOutput  = armnn::numeric_cast<int>(outputInfo.GetShape()[depthIndex]);
+    const int heightOutput = armnn::numeric_cast<int>(outputInfo.GetShape()[heightIndex]);
+    const int widthOutput  = armnn::numeric_cast<int>(outputInfo.GetShape()[widthIndex]);
+
+    const int depthInput   = armnn::numeric_cast<int>(inputInfo.GetShape()[depthIndex]);
+    const int heightInput  = armnn::numeric_cast<int>(inputInfo.GetShape()[heightIndex]);
+    const int widthInput   = armnn::numeric_cast<int>(inputInfo.GetShape()[widthIndex]);
+
+    const int padLeft      = armnn::numeric_cast<int>(params.m_PadLeft);
+    const int padRight     = armnn::numeric_cast<int>(params.m_PadRight);
+    const int padTop       = armnn::numeric_cast<int>(params.m_PadTop);
+    const int padBottom    = armnn::numeric_cast<int>(params.m_PadBottom);
+    const int padFront     = armnn::numeric_cast<int>(params.m_PadFront);
+    const int padBack      = armnn::numeric_cast<int>(params.m_PadBack);
+
+    const int strideX      = armnn::numeric_cast<int>(params.m_StrideX);
+    const int strideY      = armnn::numeric_cast<int>(params.m_StrideY);
+    const int strideZ      = armnn::numeric_cast<int>(params.m_StrideZ);
+
+    const int poolHeight   = armnn::numeric_cast<int>(params.m_PoolHeight);
+    const int poolWidth    = armnn::numeric_cast<int>(params.m_PoolWidth);
+    const int poolDepth    = armnn::numeric_cast<int>(params.m_PoolDepth);
+
+    // Resolve the three per-algorithm steps once, so the loops are algorithm-agnostic.
+    float defaultInitializer = DefaultInitializer(params.m_PoolType);
+    Accumulator accumulate   = GetAccumulator(params.m_PoolType);
+    Executor execute         = GetExecutor(params.m_PoolType);
+
+    // Check supported padding methods outside the loop to simplify
+    // the inner loop.
+    if (params.m_PaddingMethod != PaddingMethod::Exclude &&
+        params.m_PaddingMethod != PaddingMethod::IgnoreValue)
+    {
+        throw armnn::InvalidArgumentException("Unsupported padding type");
+    }
+
+    const std::vector<float> decodedInputVec = rInputDecoder.DecodeTensor(inputInfo.GetShape());
+
+    for (int n = 0; n < batchSize; n++)
+    {
+        for (int c = 0; c < channels; c++)
+        {
+            for (int zOutput = 0; zOutput < depthOutput; zOutput++)
+            {
+                // Calculate values independent of the x and y axis
+                int dstart = (zOutput * strideZ) - padFront;
+                int dend = dstart + poolDepth;
+                // Clamp the pooling region inside the valid input area (which includes the padding).
+                // This is necessary because the final pooling in a row may overlap beyond the padding.
+                dend = std::min(dend, depthInput + padBack);
+
+                int depth = dend - dstart;
+                bool dclamped = ClampRange(dstart, dend, depthInput);
+                int depthClamped = dend - dstart;
+
+                for (int yOutput = 0; yOutput < heightOutput; yOutput++)
+                {
+                    int hstart = (yOutput * strideY) - padTop;
+                    int hend = hstart + poolHeight;
+                    // Clamp the pooling region inside the valid input area (which includes the padding).
+                    // This is necessary because the final pooling in a row may overlap beyond the padding.
+                    hend = std::min(hend, heightInput + padBottom);
+
+                    int height = hend - hstart;
+                    bool hclamped = ClampRange(hstart, hend, heightInput);
+                    int heightClamped = hend - hstart;
+
+                    for (int xOutput = 0; xOutput < widthOutput; xOutput++)
+                    {
+                        int wstart = (xOutput * strideX) - padLeft;
+                        int wend = wstart + poolWidth;
+                        // Clamp the pooling region inside the valid input area (which includes the padding).
+                        // This is necessary because the final pooling in a row may overlap beyond the padding.
+                        wend = std::min(wend, widthInput + padRight);
+
+                        int width = wend - wstart;
+                        bool wclamped = ClampRange(wstart, wend, widthInput);
+                        int widthClamped = wend - wstart;
+
+                        float result = defaultInitializer;
+                        // Kernel volume including any padded positions (IgnoreValue semantics).
+                        float poolAreaSize = armnn::numeric_cast<float>(depth * height * width);
+
+                        // Special case: when the pooling kernel is over a padding region and the padding
+                        //               size is larger or equal to the kernel and the kernel only covers
+                        //               padding and no real values, then we initialize the result as zero
+                        //               by convention. This is because we need to choose a value here and
+                        //               all values we have are padding, which we ignore.
+                        if (OnPaddingOnly(dstart, dend, depthInput) ||
+                            OnPaddingOnly(hstart, hend, heightInput) ||
+                            OnPaddingOnly(wstart, wend, widthInput))
+                        {
+                            result = 0.0f;
+
+                            int outputIndex = CalculateIndex(channels, depthOutput, heightOutput, widthOutput,
+                                                             n, c, zOutput, yOutput, xOutput, dataLayout);
+
+                            rOutputEncoder[static_cast<unsigned int>(outputIndex)];
+                            rOutputEncoder.Set(result);
+
+                            continue;
+                        }
+
+                        bool clamped = (dclamped | hclamped | wclamped);
+
+                        if (clamped && params.m_PaddingMethod == PaddingMethod::Exclude)
+                        {
+                            // When we exclude the padding, it means we calculate with a smaller
+                            // kernel size, so I changed the divisor here.
+                            poolAreaSize = armnn::numeric_cast<float>(depthClamped * heightClamped * widthClamped);
+                        }
+
+                        // Accumulate over the (clamped) kernel volume.
+                        for (auto zInput = dstart; zInput < dend; zInput++)
+                        {
+                            for (auto yInput = hstart; yInput < hend; yInput++)
+                            {
+                                for (auto xInput = wstart; xInput < wend; xInput++)
+                                {
+                                    int inputIndex = CalculateIndex(channels, depthInput, heightInput, widthInput,
+                                                                    n, c, zInput, yInput, xInput, dataLayout);
+
+                                    accumulate(result, decodedInputVec[static_cast<unsigned int>(inputIndex)]);
+                                }
+                            }
+                        }
+
+                        execute(result, poolAreaSize);
+
+                        int outputIndex = CalculateIndex(channels, depthOutput, heightOutput, widthOutput,
+                                                         n, c, zOutput, yOutput, xOutput, dataLayout);
+
+                        rOutputEncoder[static_cast<unsigned int>(outputIndex)];
+                        rOutputEncoder.Set(result);
+                    }
+                }
+            }
+        }
+    }
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/Pooling3d.hpp b/src/backends/reference/workloads/Pooling3d.hpp
new file mode 100644
index 0000000..dd3c919
--- /dev/null
+++ b/src/backends/reference/workloads/Pooling3d.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Tensor.hpp>
+
+#include "BaseIterator.hpp"
+
+namespace armnn
+{
+/// Computes the Pooling3d operation.
+void Pooling3d(Decoder<float>& rInputDecoder,
+               Encoder<float>& rOutputEncoder,
+               const TensorInfo& inputInfo,
+               const TensorInfo& outputInfo,
+               const Pooling3dDescriptor& params);
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefPooling3dWorkload.cpp b/src/backends/reference/workloads/RefPooling3dWorkload.cpp
new file mode 100644
index 0000000..d1e00aa
--- /dev/null
+++ b/src/backends/reference/workloads/RefPooling3dWorkload.cpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefPooling3dWorkload.hpp"
+
+#include "Pooling3d.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include "Profiling.hpp"
+#include "BaseIterator.hpp"
+
+namespace armnn
+{
+// Synchronous execution entry point: runs against the tensors bound at construction time.
+void RefPooling3dWorkload::Execute() const
+{
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+// Async execution entry point: runs against the externally supplied working memory.
+void RefPooling3dWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+// Shared implementation for both entry points: maps the single input/output
+// tensor pair and dispatches to the type-agnostic Pooling3d kernel.
+void RefPooling3dWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefPooling3dWorkload_Execute");
+
+    const TensorInfo& inputInfo  = GetTensorInfo(inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+
+    // Decoding/encoding through float lets one Pooling3d implementation cover
+    // all data types this backend supports.
+    auto inputDecoder  = MakeDecoder<float>(inputInfo,  inputs[0] ->Map());
+    auto outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+
+    Pooling3d(*inputDecoder,
+              *outputEncoder,
+              inputInfo,
+              outputInfo,
+              m_Data.m_Parameters);
+}
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefPooling3dWorkload.hpp b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
new file mode 100644
index 0000000..1188af2
--- /dev/null
+++ b/src/backends/reference/workloads/RefPooling3dWorkload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+#include "Decoders.hpp"
+#include "Encoders.hpp"
+
+namespace armnn
+{
+/// Reference (CpuRef) workload executing a Pooling3d layer.
+class RefPooling3dWorkload : public BaseWorkload<Pooling3dQueueDescriptor>
+{
+public:
+    using BaseWorkload<Pooling3dQueueDescriptor>::BaseWorkload;
+
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    // Shared implementation used by both the sync and async entry points.
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
+};
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 914137c..700a1d6 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -14,6 +14,7 @@
 #include "FullyConnected.hpp"
 #include "Gather.hpp"
 #include "Pooling2d.hpp"
+#include "Pooling3d.hpp"
 #include "RefActivationWorkload.hpp"
 #include "RefArgMinMaxWorkload.hpp"
 #include "RefBatchNormalizationWorkload.hpp"
@@ -51,6 +52,7 @@
 #include "RefMeanWorkload.hpp"
 #include "RefNormalizationWorkload.hpp"
 #include "RefPooling2dWorkload.hpp"
+#include "RefPooling3dWorkload.hpp"
 #include "RefPermuteWorkload.hpp"
 #include "RefPadWorkload.hpp"
 #include "RefPreluWorkload.hpp"