backends/reference: Add ReduceSum operation support

This patch adds ReduceSum operation support for the reference backend,
which computes the sum of elements across dimensions of a tensor.

Changelog v1:
- Fix file header descriptions.

Changelog v2:
- Fix line limit issue.
- Fix type conversion issue.

Changelog v3:
- Remove tabs.
- Modify newly added file headers.

Changelog v4:
- Symbol on header isn't allowed so drop it from newly added file headers.

Changelog v5:
- Remove tabs, fix the use of brackets and align lines correctly.

Changelog v6:
- Add serializer and deserializer support.

Changelog v7:
- Fix build error: add missed code.

Changelog v8:
- Rename ReduceSumDescriptor to ReduceDescriptor
    - Update m_KeepDims field data type to bool on ReduceDescriptor
    - Add ReduceOperation field to ReduceDescriptor

- Rename ReduceSumLayer to ReduceLayer
    - Update ReduceLayer to use ReduceDescriptor
    - Update ReduceLayer::ValidateTensorShapesFromInputs() function

- Rename RefReduceSumWorkload to RefReduceWorkload
    - Update workload to use ReduceDescriptor
    - Update workload to use Decoders and Encoders

- Remove ReduceSum.hpp and ReduceSum.cpp
- Added Reduce.hpp and Reduce.cpp
     - Move Mean.cpp (which is implementing REDUCE_MEAN) functionality to Reduce.cpp
     - Update RefMeanWorkload to call Reduce function with ReduceOperation::Mean argument

- Remove Mean.hpp and Mean.cpp
- Update the Serializer/Deserializer ArmnnSchema.fbs for ReduceLayer, ReduceDescriptor, and ReduceOperation
- Update Serializer and Deserializer for serializing/parsing ReduceLayer
- Added TfLite parser Sum test for REDUCE_SUM operator
- Make corresponding changes on front-end and Ref backend to support REDUCE_SUM operator

Changelog v9:
- Fixed build errors.

Change-Id: I8c8e034f3df73f9565b3c18eff51ecca6c542195
Signed-off-by: Inki Dae <inki.dae@samsung.com>
Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index 5435910..77067d9 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -512,6 +512,14 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsReduceSupported(const TensorInfo& /*input*/,
+                                         const TensorInfo& /*output*/,
+                                         const ReduceDescriptor& /*descriptor*/,
+                                         Optional<std::string&> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
 bool LayerSupportBase::IsReshapeSupported(const TensorInfo&, // input
                                           const TensorInfo&, // output
                                           const ReshapeDescriptor&, // descriptor
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index 7b873e3..e04d657 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -315,6 +315,11 @@
                          const TensorInfo& output,
                          Optional<std::string&> reasonIfUnsupported) const override;
 
+    bool IsReduceSupported(const TensorInfo& input,
+                           const TensorInfo& output,
+                           const ReduceDescriptor& descriptor,
+                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsReshapeSupported(const TensorInfo& input,
                             const TensorInfo& output,
                             const ReshapeDescriptor& descriptor,
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index d795e32..b51099f 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -3633,4 +3633,31 @@
     }
 }
 
+void ReduceQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"ReduceQueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 4, "input");
+
+    std::vector<DataType> supportedTypes =
+    {
+        DataType::BFloat16,
+        DataType::Float16,
+        DataType::Float32,
+        DataType::QAsymmS8,
+        DataType::QAsymmU8,
+        DataType::QSymmS16,
+        DataType::Signed32
+    };
+
+    ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
+    ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 0a232dc..8a2dd1f 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -668,4 +668,9 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct ReduceQueueDescriptor : QueueDescriptorWithParameters<ReduceDescriptor>
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 3a8a2ae..19281a8 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -1220,6 +1220,18 @@
 
             break;
         }
+        case LayerType::Reduce:
+        {
+            auto cLayer = PolymorphicDowncast<const ReduceLayer*>(&layer);
+            const TensorInfo& input  = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+
+            result = layerSupportObject->IsReduceSupported(OverrideDataType(input, dataType),
+                                                           OverrideDataType(output, dataType),
+                                                           cLayer->GetParameters(),
+                                                           reason);
+            break;
+        }
         default:
         {
             ARMNN_ASSERT_MSG(false, "WorkloadFactory did not recognise type of layer.");
@@ -1593,6 +1605,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreateReduce(const ReduceQueueDescriptor& /*descriptor*/,
+                                                          const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& /*descriptor*/,
                                                            const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index 2e813e9..6ab6d2c 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -231,6 +231,9 @@
     virtual std::unique_ptr<IWorkload> CreateRank(const RankQueueDescriptor& descriptor,
                                                   const WorkloadInfo& info) const;
 
+    virtual std::unique_ptr<IWorkload> CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                                    const WorkloadInfo& info) const;
+
     virtual std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                                      const WorkloadInfo& info) const;
 
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 7254d21..3b6299d 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -75,6 +75,7 @@
     test/layerTests/PadTestImpl.cpp \
     test/layerTests/Pooling2dTestImpl.cpp \
     test/layerTests/RankTestImpl.cpp \
+    test/layerTests/ReduceSumTestImpl.cpp \
     test/layerTests/ReshapeTestImpl.cpp \
     test/layerTests/ResizeTestImpl.cpp \
     test/layerTests/RsqrtTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 7894895..b20ef2d 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -137,6 +137,8 @@
     layerTests/QuantizeTestImpl.hpp
     layerTests/RankTestImpl.cpp
     layerTests/RankTestImpl.hpp
+    layerTests/ReduceSumTestImpl.cpp
+    layerTests/ReduceSumTestImpl.hpp
     layerTests/ReshapeTestImpl.cpp
     layerTests/ReshapeTestImpl.hpp
     layerTests/ResizeTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index 1492a80..c7d1dd2 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -677,6 +677,8 @@
 
 DECLARE_LAYER_POLICY_1_PARAM(Subtraction)
 
+DECLARE_LAYER_POLICY_2_PARAM(Reduce)
+
 DECLARE_LAYER_POLICY_1_PARAM(Switch)
 
 DECLARE_LAYER_POLICY_2_PARAM(Transpose)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index e9eb5b9..d87a3b0 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -48,6 +48,7 @@
 #include <backendsCommon/test/layerTests/PreluTestImpl.hpp>
 #include <backendsCommon/test/layerTests/QuantizeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/RankTestImpl.hpp>
+#include <backendsCommon/test/layerTests/ReduceSumTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ReshapeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ResizeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/RsqrtTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp
new file mode 100644
index 0000000..4edbd11
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp
@@ -0,0 +1,344 @@
+//
+// Copyright © 2020 Samsung Electronics Co Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ReduceSumTestImpl.hpp"
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory,
+        const armnn::TensorInfo inputTensorInfo,
+        const armnn::TensorInfo outputTensorInfo,
+        const std::vector<float>& inputData,
+        const std::vector<float>& outputData,
+        const std::vector<int32_t> vAxis,
+        const armnn::ReduceOperation reduceOperation)
+{
+    IgnoreUnused(memoryManager);
+    auto inputTensor = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
+
+    LayerTestResult<float, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<float, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ReduceQueueDescriptor descriptor;
+    std::vector<uint32_t> updated_idx;
+    uint32_t resolvedAxis = 0;
+    for (uint32_t i = 0; i < vAxis.size(); ++i)
+    {
+        if (vAxis[i] <  0)
+        {
+            resolvedAxis = inputTensorInfo.GetNumDimensions() + static_cast<uint32_t>(vAxis[i]);
+        } else
+        {
+            resolvedAxis = static_cast<uint32_t>(vAxis[i]);
+        }
+
+        updated_idx.push_back(resolvedAxis);
+    }
+
+    descriptor.m_Parameters.m_vAxis = updated_idx;
+    descriptor.m_Parameters.m_ReduceOperation = reduceOperation;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateReduce(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), inputTensor.origin());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(result.output.origin(), outputHandle.get());
+
+    return result;
+}
+
+} // namespace
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceSumSimpleTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const armnn::TensorShape inputShape{ 1, 1, 1, 5 };
+    const armnn::TensorShape outputShape{ 1, 1, 1, 1};
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
+
+    std::vector<float> inputValues({ 5.0f, 2.0f, 8.0f, 10.0f, 9.0f });
+    std::vector<float> outputValues({ 34.0f });
+
+    return ReduceTestCommon<ArmnnType>(workloadFactory,
+                                       memoryManager,
+                                       tensorHandleFactory,
+                                       inputTensorInfo,
+                                       outputTensorInfo,
+                                       inputValues,
+                                       outputValues,
+                                       { -1 },
+                                       armnn::ReduceOperation::Sum);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest1(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const armnn::TensorShape inputShape{ 1, 3, 2, 4 };
+    const armnn::TensorShape outputShape{ 1, 1, 2, 4};
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
+
+    std::vector<float> inputValues({  1.0f,   2.0f,   3.0f,   4.0f,
+                                      5.0f,   6.0f,   7.0f,   8.0f,
+
+                                     10.0f,  20.0f,  30.0f,  40.0f,
+                                     50.0f,  60.0f,  70.0f,  80.0f,
+
+                                    100.0f, 200.0f, 300.0f, 400.0f,
+                                    500.0f, 600.0f, 700.0f, 800.0f });
+    std::vector<float> outputValues({ 111.0f, 222.0f, 333.0f, 444.0f,
+                                      555.0f, 666.0f, 777.0f, 888.0f });
+
+    return ReduceTestCommon<ArmnnType>(workloadFactory,
+                                       memoryManager,
+                                       tensorHandleFactory,
+                                       inputTensorInfo,
+                                       outputTensorInfo,
+                                       inputValues,
+                                       outputValues,
+                                       { 1 },
+                                       armnn::ReduceOperation::Sum);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest2(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const armnn::TensorShape inputShape{ 1, 6, 3, 4 };
+    const armnn::TensorShape outputShape{ 1, 1, 3, 4};
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
+
+    std::vector<float> inputValues( {7, 8, 6, 1,
+                                     1, 1, 8, 7,
+                                     3, 7, 7, 7,
+
+                                     6, 8, 4, 7,
+                                     3, 8, 7, 3,
+                                     5, 8, 8, 8,
+
+
+                                     7, 8, 2, 7,
+                                     3, 8, 5, 6,
+                                     8, 4, 2, 7,
+
+                                     1, 6, 7, 2,
+                                     8, 3, 3, 1,
+                                     7, 6, 2, 6,
+
+
+                                     5, 3, 4, 8,
+                                     7, 8, 2, 4,
+                                     6, 6, 2, 8,
+
+                                     2, 2, 7, 2,
+                                     5, 3, 6, 3,
+                                     6, 1, 8, 8});
+    std::vector<float> outputValues({  28.0f, 35.0f, 30.0f, 27.0f,
+                                       27.0f, 31.0f, 31.0f, 24.0f,
+                                       35.0f, 32.0f, 29.0f, 44.0f});
+
+    return ReduceTestCommon<ArmnnType>(workloadFactory,
+                                       memoryManager,
+                                       tensorHandleFactory,
+                                       inputTensorInfo,
+                                       outputTensorInfo,
+                                       inputValues,
+                                       outputValues,
+                                       { 1 },
+                                       armnn::ReduceOperation::Sum);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest3(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const armnn::TensorShape inputShape{ 1, 6, 3, 4 };
+    const armnn::TensorShape outputShape{ 1, 6, 3, 1};
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
+
+    std::vector<float> inputValues( {7, 8, 6, 1,
+                                     1, 1, 8, 7,
+                                     3, 7, 7, 7,
+
+                                     6, 8, 4, 7,
+                                     3, 8, 7, 3,
+                                     5, 8, 8, 8,
+
+
+                                     7, 8, 2, 7,
+                                     3, 8, 5, 6,
+                                     8, 4, 2, 7,
+
+                                     1, 6, 7, 2,
+                                     8, 3, 3, 1,
+                                     7, 6, 2, 6,
+
+
+                                     5, 3, 4, 8,
+                                     7, 8, 2, 4,
+                                     6, 6, 2, 8,
+
+                                     2, 2, 7, 2,
+                                     5, 3, 6, 3,
+                                     6, 1, 8, 8});
+    std::vector<float> outputValues({  22.0f, 17.0f, 24.0f,
+                                       25.0f, 21.0f, 29.0f,
+
+                                       24.0f, 22.0f, 21.0f,
+                                       16.0f, 15.0f, 21.0f,
+
+                                       20.0f, 21.0f, 22.0f,
+                                       13.0f, 17.0f, 23.0f});
+
+    return ReduceTestCommon<ArmnnType>(workloadFactory,
+                                       memoryManager,
+                                       tensorHandleFactory,
+                                       inputTensorInfo,
+                                       outputTensorInfo,
+                                       inputValues,
+                                       outputValues,
+                                       { 3 },
+                                       armnn::ReduceOperation::Sum);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceSumMultipleAxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const armnn::TensorShape inputShape{ 1, 3, 2, 4 };
+    const armnn::TensorShape outputShape{ 1, 1, 1, 4};
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    armnn::TensorInfo outputTensorInfo(outputShape, armnn::DataType::Float32);
+
+    std::vector<float> inputValues({  1.0f,   2.0f,   3.0f,   4.0f,
+                                      5.0f,   6.0f,   7.0f,   8.0f,
+
+                                     10.0f,  20.0f,  30.0f,  40.0f,
+                                     50.0f,  60.0f,  70.0f,  80.0f,
+
+                                    100.0f, 200.0f, 300.0f, 400.0f,
+                                    500.0f, 600.0f, 700.0f, 800.0f });
+    std::vector<float> outputValues({ 666.0f, 888.0f, 1110.0f, 1332.0f });
+
+    return ReduceTestCommon<ArmnnType>(workloadFactory,
+                                       memoryManager,
+                                       tensorHandleFactory,
+                                       inputTensorInfo,
+                                       outputTensorInfo,
+                                       inputValues,
+                                       outputValues,
+                                       { 1, 2 },
+                                       armnn::ReduceOperation::Sum);
+}
+
+// Explicit template specializations
+
+template LayerTestResult<float, 4>
+ReduceSumSimpleTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceSumSingleAxisTest1<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceSumSingleAxisTest2<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceSumSingleAxisTest3<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceSumMultipleAxisTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
diff --git a/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.hpp
new file mode 100644
index 0000000..db23240
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2020 Samsung Electronics Co Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceSumSimpleTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest1(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest2(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceSumSingleAxisTest3(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceSumMultipleAxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index bdaaafb..992ae71 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -1706,6 +1706,36 @@
            "Reference rank: input type not supported.");
 }
 
+bool RefLayerSupport::IsReduceSupported(const TensorInfo& input,
+                                        const TensorInfo& output,
+                                        const ReduceDescriptor& descriptor,
+                                        Optional<std::string&> reasonIfUnsupported) const
+{
+    IgnoreUnused(descriptor);
+    bool supported = true;
+    std::array<DataType,7> supportedTypes =
+    {
+        DataType::BFloat16,
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QAsymmS8,
+        DataType::QAsymmU8,
+        DataType::QSymmS16,
+        DataType::Signed32
+    };
+
+    supported &= CheckSupportRule(TypeAnyOf(input, supportedTypes), reasonIfUnsupported,
+                                  "Reference Reduce: input type not supported");
+
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
+                                  "Reference Reduce: output type not supported");
+
+    supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
+                                  "Reference Reduce: input and output types not matching");
+
+    return supported;
+}
+
 bool RefLayerSupport::IsReshapeSupported(const TensorInfo& input,
                                          const TensorInfo& output,
                                          const ReshapeDescriptor& descriptor,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 6b64408..b75b778 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -275,6 +275,11 @@
                          const TensorInfo& output,
                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsReduceSupported(const TensorInfo& input,
+                           const TensorInfo& output,
+                           const ReduceDescriptor& descriptor,
+                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsReshapeSupported(const TensorInfo& input,
                             const TensorInfo& output,
                             const ReshapeDescriptor& descriptor,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 468aeb3..fde6c86 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -580,6 +580,12 @@
     return std::make_unique<RefRankWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                                            const WorkloadInfo& info) const
+{
+    return std::make_unique<RefReduceWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                                              const WorkloadInfo& info) const
 {
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index 41cefd3..c22d87f 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -223,6 +223,9 @@
     std::unique_ptr<IWorkload> CreateRank(const RankQueueDescriptor& descriptor,
                                           const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                              const WorkloadInfo& info) const override;
 
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index b4aa3a0..9676509 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -38,11 +38,11 @@
         workloads/InstanceNorm.cpp \
         workloads/LogSoftmax.cpp \
         workloads/LstmUtils.cpp \
-        workloads/Mean.cpp \
         workloads/Concatenate.cpp \
         workloads/Pad.cpp \
         workloads/Pooling2d.cpp \
         workloads/PreluImpl.cpp \
+        workloads/Reduce.cpp \
         workloads/RefActivationWorkload.cpp \
         workloads/RefArgMinMaxWorkload.cpp \
         workloads/RefBatchNormalizationWorkload.cpp \
@@ -81,6 +81,7 @@
         workloads/RefPreluWorkload.cpp \
         workloads/RefQLstmWorkload.cpp \
         workloads/RefQuantizeWorkload.cpp \
+        workloads/RefReduceWorkload.cpp \
         workloads/RefReshapeWorkload.cpp \
         workloads/RefResizeBilinearWorkload.cpp \
         workloads/RefResizeWorkload.cpp \
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 502e0cb..d5e0f82 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -2234,4 +2234,11 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalAndBroadcast3, LogicalAndBroadcast3Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalOrBroadcast3, LogicalOrBroadcast3Test)
 
+// ReduceSum
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumFloat32, ReduceSumSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_1, ReduceSumSingleAxisTest1<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_2, ReduceSumSingleAxisTest2<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_3, ReduceSumSingleAxisTest3<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumMultipleAxisFloat32, ReduceSumMultipleAxisTest<DataType::Float32>)
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 1b20e5b..1f4298b 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -44,8 +44,6 @@
     LstmUtils.hpp
     LstmUtils.cpp
     Maximum.hpp
-    Mean.cpp
-    Mean.hpp
     Concatenate.hpp
     Concatenate.cpp
     Minimum.hpp
@@ -55,6 +53,8 @@
     Pooling2d.hpp
     PreluImpl.cpp
     PreluImpl.hpp
+    Reduce.cpp
+    Reduce.hpp
     RefActivationWorkload.cpp
     RefActivationWorkload.hpp
     RefArgMinMaxWorkload.cpp
@@ -132,6 +132,8 @@
     RefQLstmWorkload.cpp
     RefQLstmWorkload.hpp
     RefRankWorkload.hpp
+    RefReduceWorkload.cpp
+    RefReduceWorkload.hpp
     RefReshapeWorkload.cpp
     RefReshapeWorkload.hpp
     RefResizeBilinearWorkload.cpp
diff --git a/src/backends/reference/workloads/Mean.hpp b/src/backends/reference/workloads/Mean.hpp
deleted file mode 100644
index dfb0302..0000000
--- a/src/backends/reference/workloads/Mean.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//
-// Copyright © 2017 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "armnn/DescriptorsFwd.hpp"
-#include "armnn/Tensor.hpp"
-#include "BaseIterator.hpp"
-
-#include <vector>
-
-namespace armnn
-{
-void Mean(const TensorInfo& inputInfo,
-          const TensorInfo& outputInfo,
-          const std::vector<unsigned int>& axis,
-          Decoder<float>& input,
-          Encoder<float>& output);
-} //namespace armnn
-
diff --git a/src/backends/reference/workloads/Mean.cpp b/src/backends/reference/workloads/Reduce.cpp
similarity index 78%
rename from src/backends/reference/workloads/Mean.cpp
rename to src/backends/reference/workloads/Reduce.cpp
index fe34efe..5375c71 100644
--- a/src/backends/reference/workloads/Mean.cpp
+++ b/src/backends/reference/workloads/Reduce.cpp
@@ -1,13 +1,14 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
-#include "Mean.hpp"
-#include <backendsCommon/WorkloadData.hpp>
+#include "Reduce.hpp"
 
 #include <armnn/utility/NumericCast.hpp>
 
+#include <backendsCommon/WorkloadData.hpp>
+
 #include <cmath>
 #include <cstddef>
 #include <functional>
@@ -15,6 +16,7 @@
 
 namespace armnn
 {
+
 bool NextIndex(const unsigned int numDims, const armnn::TensorShape& dims, std::vector<unsigned int>& current)
 {
     unsigned int carry = 1;
@@ -64,18 +66,16 @@
     }
     return offset;
 }
-} // namespace
 
-namespace armnn
-{
-void Mean(const armnn::TensorInfo& inputInfo,
-          const armnn::TensorInfo& outputInfo,
-          const std::vector<unsigned int>& axis,
-          Decoder<float>& input,
-          Encoder<float>& output)
-{
 
-    unsigned int inputNumDims = inputInfo.GetNumDimensions();
+void Reduce(const TensorInfo& inputInfo,
+            const TensorInfo& outputInfo,
+            Decoder<float>& input,
+            Encoder<float>& output,
+            const std::vector<uint32_t> axis,
+            const ReduceOperation reduceOperation)
+{
+    unsigned int inputNumDims  = inputInfo.GetNumDimensions();
     unsigned int outputNumDims = outputInfo.GetNumDimensions();
 
     armnn::TensorShape outputDims = outputInfo.GetShape();
@@ -106,10 +106,10 @@
     std::vector<unsigned int> resolvedAxis = axis;
     if (resolvedAxis.empty())
     {
-      for (unsigned int idx = 0; idx < inputNumDims; ++idx)
-      {
-          resolvedAxis.push_back(idx);
-      }
+        for (unsigned int idx = 0; idx < inputNumDims; ++idx)
+        {
+            resolvedAxis.push_back(idx);
+        }
     }
     auto numResolvedAxis = armnn::numeric_cast<unsigned int>(resolvedAxis.size());
 
@@ -129,15 +129,23 @@
     {
         unsigned int current = inputDims[resolvedAxis[idx]];
         ARMNN_ASSERT(armnn::numeric_cast<float>(current) <
-              (std::numeric_limits<float>::max() / armnn::numeric_cast<float>(numElementsInAxis)));
+                     (std::numeric_limits<float>::max() / armnn::numeric_cast<float>(numElementsInAxis)));
         numElementsInAxis *= current;
     }
     if (numElementsInAxis > 0) {
         for (unsigned int idx = 0; idx < numOutputs; ++idx)
         {
             output[idx];
-            output.Set(tempSum[idx] / armnn::numeric_cast<float>(numElementsInAxis));
+            if (reduceOperation == ReduceOperation::Sum)
+            {
+                output.Set(tempSum[idx]);
+            }
+            else if (reduceOperation == ReduceOperation::Mean)
+            {
+                output.Set(tempSum[idx] / armnn::numeric_cast<float>(numElementsInAxis));
+            }
         }
     }
 }
-} //namespace armnn
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/Reduce.hpp b/src/backends/reference/workloads/Reduce.hpp
new file mode 100644
index 0000000..ad777ad
--- /dev/null
+++ b/src/backends/reference/workloads/Reduce.hpp
@@ -0,0 +1,24 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "BaseIterator.hpp"
+#include "Decoders.hpp"
+#include "Encoders.hpp"
+
+#include <armnn/Tensor.hpp>
+
+namespace armnn
+{
+
+void Reduce(const TensorInfo& inputInfo,
+            const TensorInfo& outputInfo,
+            Decoder<float>& input,
+            Encoder<float>& output,
+            const std::vector<uint32_t> axis,
+            const ReduceOperation reduceOperation);
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefMeanWorkload.cpp b/src/backends/reference/workloads/RefMeanWorkload.cpp
index 375ab39..00e59bc 100644
--- a/src/backends/reference/workloads/RefMeanWorkload.cpp
+++ b/src/backends/reference/workloads/RefMeanWorkload.cpp
@@ -5,7 +5,7 @@
 
 #include "RefMeanWorkload.hpp"
 
-#include "Mean.hpp"
+#include "Reduce.hpp"
 #include "RefWorkloadUtils.hpp"
 
 #include "Profiling.hpp"
@@ -28,7 +28,12 @@
     auto inputDecoder  = MakeDecoder<float>(inputInfo,  m_Data.m_Inputs[0]->Map());
     auto outputEncoder = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
 
-    Mean(inputInfo, outputInfo, m_Data.m_Parameters.m_Axis, *inputDecoder, *outputEncoder);
+    Reduce(inputInfo,
+           outputInfo,
+           *inputDecoder,
+           *outputEncoder,
+           m_Data.m_Parameters.m_Axis,
+           armnn::ReduceOperation::Mean);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefReduceWorkload.cpp b/src/backends/reference/workloads/RefReduceWorkload.cpp
new file mode 100644
index 0000000..7a46ff9
--- /dev/null
+++ b/src/backends/reference/workloads/RefReduceWorkload.cpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2020 Samsung Electronics Co Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefReduceWorkload.hpp"
+
+#include "Reduce.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "BaseIterator.hpp"
+#include "Profiling.hpp"
+
+namespace armnn
+{
+
+RefReduceWorkload::RefReduceWorkload(
+    const ReduceQueueDescriptor& descriptor,
+    const WorkloadInfo& info)
+    : BaseWorkload<ReduceQueueDescriptor>(descriptor, info) {}
+
+void RefReduceWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefReduceWorkload_Execute");
+
+    const TensorInfo& inputInfo  = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+
+    std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map());
+    Decoder<float>& decoder = *decoderPtr;
+
+    std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map());
+    Encoder<float>& encoder = *encoderPtr;
+
+    Reduce(inputInfo,
+           outputInfo,
+           decoder,
+           encoder,
+           m_Data.m_Parameters.m_vAxis,
+           m_Data.m_Parameters.m_ReduceOperation);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefReduceWorkload.hpp b/src/backends/reference/workloads/RefReduceWorkload.hpp
new file mode 100644
index 0000000..1d551ac
--- /dev/null
+++ b/src/backends/reference/workloads/RefReduceWorkload.hpp
@@ -0,0 +1,23 @@
+//
+// Copyright © 2020 Samsung Electronics Co Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
+{
+public:
+    explicit RefReduceWorkload(const ReduceQueueDescriptor& descriptor,
+                               const WorkloadInfo& info);
+
+    virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 390b2a8..989644f 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -54,6 +54,7 @@
 #include "RefQLstmWorkload.hpp"
 #include "RefQuantizeWorkload.hpp"
 #include "RefRankWorkload.hpp"
+#include "RefReduceWorkload.hpp"
 #include "RefReshapeWorkload.hpp"
 #include "RefResizeBilinearWorkload.hpp"
 #include "RefResizeWorkload.hpp"