MLCE-347 'REDUCE_MIN, REDUCE_MAX, REDUCE_SUM Support'

* Added TfLiteParser support for REDUCE_MIN and REDUCE_MAX operators
* Added ACL workloads support for REDUCE_MIN, REDUCE_MAX, and REDUCE_SUM operators
* Added TfLite Delegate support for REDUCE_MIN, REDUCE_MAX, and REDUCE_SUM operators

Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Change-Id: I8085d59946bfd4ab78a59a61f899031ae53371a8
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a71de7..c862c55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -760,7 +760,6 @@
             src/armnnTfParser/test/Convolution2d.cpp
             src/armnnTfParser/test/Concat.cpp
             src/armnnTfParser/test/ConcatOfConcats.cpp
-            src/armnnTfLiteParser/test/DepthToSpace.cpp
             src/armnnTfParser/test/DepthwiseConvolution2d.cpp
             src/armnnTfParser/test/Equal.cpp
             src/armnnTfParser/test/ExpandDims.cpp
@@ -808,6 +807,7 @@
              src/armnnTfLiteParser/test/Constant.cpp
              src/armnnTfLiteParser/test/Conv2D.cpp
              src/armnnTfLiteParser/test/DepthwiseConvolution2D.cpp
+             src/armnnTfLiteParser/test/DepthToSpace.cpp
              src/armnnTfLiteParser/test/Dequantize.cpp
              src/armnnTfLiteParser/test/DetectionPostProcess.cpp
              src/armnnTfLiteParser/test/Div.cpp
@@ -825,6 +825,7 @@
              src/armnnTfLiteParser/test/Neg.cpp
              src/armnnTfLiteParser/test/Pack.cpp
              src/armnnTfLiteParser/test/Pad.cpp
+             src/armnnTfLiteParser/test/Reduce.cpp
              src/armnnTfLiteParser/test/Reshape.cpp
              src/armnnTfLiteParser/test/ResizeBilinear.cpp
              src/armnnTfLiteParser/test/ResizeNearestNeighbor.cpp
diff --git a/delegate/CMakeLists.txt b/delegate/CMakeLists.txt
index 777702e..74390c8 100644
--- a/delegate/CMakeLists.txt
+++ b/delegate/CMakeLists.txt
@@ -37,6 +37,7 @@
         src/Pooling.hpp
         src/Quantization.hpp
         src/Redefine.hpp
+        src/Reduce.hpp
         src/Resize.hpp
         src/Round.hpp
         src/Slice.hpp
@@ -143,6 +144,8 @@
         src/test/QuantizationTest.cpp
         src/test/QuantizationTestHelper.hpp
         src/test/RedefineTestHelper.hpp
+        src/test/ReduceTest.cpp
+        src/test/ReduceTestHelper.hpp
         src/test/ReshapeTest.cpp
         src/test/ResizeTest.cpp
         src/test/ResizeTestHelper.hpp
diff --git a/delegate/src/Reduce.hpp b/delegate/src/Reduce.hpp
new file mode 100644
index 0000000..13a11d3
--- /dev/null
+++ b/delegate/src/Reduce.hpp
@@ -0,0 +1,133 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <tensorflow/lite/builtin_ops.h>
+#include <tensorflow/lite/c/builtin_op_data.h>
+#include <tensorflow/lite/c/common.h>
+#include <tensorflow/lite/kernels/internal/tensor_ctypes.h>
+#include <tensorflow/lite/minimal_logging.h>
+
+namespace armnnDelegate
+{
+
+TfLiteStatus VisitReduceOperator(DelegateData& delegateData,
+                                 TfLiteContext* tfLiteContext,
+                                 TfLiteNode* tfLiteNode,
+                                 int nodeIndex,
+                                 int32_t reduceOperatorCode)
+{
+    // Handles TfLite REDUCE_MAX / REDUCE_MIN / SUM nodes: builds an armnn::ReduceDescriptor, then either
+    // only queries backend support (delegateData.m_Network not set) or adds a Reduce layer to the network.
+    TF_LITE_ENSURE_STATUS(ValidateNumInputs(tfLiteContext, tfLiteNode, 2, nodeIndex));
+    TF_LITE_ENSURE_STATUS(ValidateNumOutputs(tfLiteContext, tfLiteNode, 1, nodeIndex));
+
+    const TfLiteTensor* tfLiteTensors = tfLiteContext->tensors;
+    const TfLiteTensor& tfLiteInputTensor = tfLiteTensors[tfLiteNode->inputs->data[0]];
+    if (!IsValid(tfLiteContext, tfLiteInputTensor, reduceOperatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const TfLiteTensor& tfLiteOutputTensor = tfLiteTensors[tfLiteNode->outputs->data[0]];
+    if (!IsValid(tfLiteContext, tfLiteOutputTensor, reduceOperatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const armnn::TensorInfo& inputTensorInfo  = GetTensorInfoForTfLiteTensor(tfLiteInputTensor);
+    const armnn::TensorInfo& outputTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteOutputTensor);
+
+    // The second input holds the axes to reduce over.
+    const TfLiteTensor& tfLiteAxisTensor = tfLiteTensors[tfLiteNode->inputs->data[1]];
+    if (!IsValid(tfLiteContext, tfLiteAxisTensor, reduceOperatorCode, nodeIndex))
+    {
+        return kTfLiteError;
+    }
+
+    const armnn::TensorInfo& axisTensorInfo = GetTensorInfoForTfLiteTensor(tfLiteAxisTensor);
+    auto* axisTensorData = tflite::GetTensorData<int32_t>(&tfLiteAxisTensor);
+
+    // Collect the axes as a sorted, duplicate-free set of non-negative dimension indices.
+    const unsigned int rank = inputTensorInfo.GetNumDimensions();
+    std::set<unsigned int> uniqueAxis;
+    if (axisTensorData != nullptr)
+    {
+        const int32_t signedRank = static_cast<int32_t>(rank);
+        for (unsigned int i = 0; i < axisTensorInfo.GetNumElements(); ++i)
+        {
+            // Negative axes count back from the last dimension. Values outside [-rank, rank) name no
+            // dimension, so reject them instead of wrapping (a modulo wrap also divides by zero for rank 0).
+            const int32_t axisValue = axisTensorData[i];
+            if (axisValue < -signedRank || axisValue >= signedRank)
+            {
+                TF_LITE_MAYBE_KERNEL_LOG(
+                    tfLiteContext,
+                    "TfLiteArmnnDelegate: Axis #%d is out of range in operator #%d node #%d: ",
+                    axisValue, reduceOperatorCode, nodeIndex);
+                return kTfLiteError;
+            }
+            uniqueAxis.insert(static_cast<unsigned int>(axisValue < 0 ? axisValue + signedRank : axisValue));
+        }
+    }
+    else
+    {
+        // GetTensorData returned no data: reduce over every dimension of the input.
+        for (unsigned int i = 0; i < rank; ++i)
+        {
+            uniqueAxis.insert(i);
+        }
+    }
+
+    armnn::ReduceDescriptor desc;
+    desc.m_vAxis.assign(uniqueAxis.begin(), uniqueAxis.end());
+
+    auto* reducerParameters = reinterpret_cast<TfLiteReducerParams*>(tfLiteNode->builtin_data);
+    desc.m_KeepDims = reducerParameters->keep_dims;
+    switch (reduceOperatorCode)
+    {
+        case kTfLiteBuiltinReduceMax: desc.m_ReduceOperation = armnn::ReduceOperation::Max; break;
+        case kTfLiteBuiltinReduceMin: desc.m_ReduceOperation = armnn::ReduceOperation::Min; break;
+        case kTfLiteBuiltinSum:       desc.m_ReduceOperation = armnn::ReduceOperation::Sum; break;
+        default:
+            TF_LITE_MAYBE_KERNEL_LOG(
+                tfLiteContext,
+                "TfLiteArmnnDelegate: Unsupported Reduction Operator #%d node #%d: ",
+                reduceOperatorCode, nodeIndex);
+            return kTfLiteError;
+    }
+
+    bool isSupported = false;
+    auto validateFunc = [&](const armnn::TensorInfo& outInfo, bool& isSupported)
+    {
+        FORWARD_LAYER_SUPPORT_FUNC(__func__,
+                                   tfLiteContext,
+                                   IsReduceSupported,
+                                   delegateData.m_Backends,
+                                   isSupported,
+                                   inputTensorInfo,
+                                   outInfo,
+                                   desc);
+    };
+
+    if (!delegateData.m_Network)
+    {
+        validateFunc(outputTensorInfo, isSupported);
+        return isSupported ? kTfLiteOk : kTfLiteError;
+    }
+
+    // Add a Reduce layer
+    armnn::IConnectableLayer* layer = delegateData.m_Network->AddReduceLayer(desc);
+    ARMNN_ASSERT(layer != nullptr);
+
+    armnn::IOutputSlot& outputSlot = layer->GetOutputSlot(0);
+    outputSlot.SetTensorInfo(outputTensorInfo);
+
+    // Connect
+    return Connect(layer, tfLiteNode, delegateData);
+}
+
+} // namespace armnnDelegate
diff --git a/delegate/src/armnn_delegate.cpp b/delegate/src/armnn_delegate.cpp
index 3ebc0cc..2b07fc7 100644
--- a/delegate/src/armnn_delegate.cpp
+++ b/delegate/src/armnn_delegate.cpp
@@ -25,6 +25,7 @@
 #include "Pooling.hpp"
 #include "Quantization.hpp"
 #include "Redefine.hpp"
+#include "Reduce.hpp"
 #include "Resize.hpp"
 #include "Round.hpp"
 #include "Slice.hpp"
@@ -733,6 +734,18 @@
                                         tfLiteNode,
                                         nodeIndex,
                                         kTfLiteBuiltinRank);
+        case kTfLiteBuiltinReduceMax:
+            return VisitReduceOperator(delegateData,
+                                       tfLiteContext,
+                                       tfLiteNode,
+                                       nodeIndex,
+                                       kTfLiteBuiltinReduceMax);
+        case kTfLiteBuiltinReduceMin:
+            return VisitReduceOperator(delegateData,
+                                       tfLiteContext,
+                                       tfLiteNode,
+                                       nodeIndex,
+                                       kTfLiteBuiltinReduceMin);
         case kTfLiteBuiltinRelu:
             return VisitActivationOperator(delegateData,
                                            tfLiteContext,
@@ -805,6 +818,12 @@
                                       tfLiteNode,
                                       nodeIndex,
                                       kTfLiteBuiltinStridedSlice);
+        case kTfLiteBuiltinSum:
+            return VisitReduceOperator(delegateData,
+                                       tfLiteContext,
+                                       tfLiteNode,
+                                       nodeIndex,
+                                       kTfLiteBuiltinSum);
         case kTfLiteBuiltinTranspose:
             return VisitTransposeOperator(delegateData,
                                           tfLiteContext,
diff --git a/delegate/src/test/ReduceTest.cpp b/delegate/src/test/ReduceTest.cpp
new file mode 100644
index 0000000..49608b6
--- /dev/null
+++ b/delegate/src/test/ReduceTest.cpp
@@ -0,0 +1,354 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ReduceTestHelper.hpp"
+
+#include <armnn_delegate.hpp>
+
+#include <flatbuffers/flatbuffers.h>
+#include <tensorflow/lite/schema/schema_generated.h>
+
+#include <doctest/doctest.h>
+
+namespace armnnDelegate
+{
+
+void ReduceUint8KeepDimsTest(tflite::BuiltinOperator reduceOperatorCode,
+                             std::vector<armnn::BackendId>& backends,
+                             std::vector<uint8_t>& expectedOutputValues)
+{
+    // Reduces a { 1, 1, 2, 3 } uint8 tensor over axis 2 with keepDims set, so the output stays 4D.
+    std::vector<int32_t> dataShape   = { 1, 1, 2, 3 };
+    std::vector<int32_t> axisShape   = { 1 };
+    std::vector<int32_t> resultShape = { 1, 1, 1, 3 };
+    std::vector<int32_t> axisValues  = { 2 };
+    std::vector<uint8_t> dataValues  = { 1, 2, 3,
+                                         4, 3, 1 };
+
+    ReduceTest<uint8_t>(reduceOperatorCode,
+                        ::tflite::TensorType_UINT8,
+                        backends,
+                        dataShape,
+                        axisShape,
+                        resultShape,
+                        dataValues,
+                        axisValues,
+                        expectedOutputValues,
+                        true);
+}
+
+void ReduceUint8Test(tflite::BuiltinOperator reduceOperatorCode,
+                     std::vector<armnn::BackendId>& backends,
+                     std::vector<uint8_t>& expectedOutputValues)
+{
+    // Reduces a { 1, 1, 2, 3 } uint8 tensor over axis 2 with keepDims cleared, so the output is 3D.
+    std::vector<int32_t> dataShape   = { 1, 1, 2, 3 };
+    std::vector<int32_t> axisShape   = { 1 };
+    std::vector<int32_t> resultShape = { 1, 1, 3 };
+    std::vector<int32_t> axisValues  = { 2 };
+    std::vector<uint8_t> dataValues  = { 1, 2, 3,
+                                         4, 3, 1 };
+
+    ReduceTest<uint8_t>(reduceOperatorCode,
+                        ::tflite::TensorType_UINT8,
+                        backends,
+                        dataShape,
+                        axisShape,
+                        resultShape,
+                        dataValues,
+                        axisValues,
+                        expectedOutputValues,
+                        false);
+}
+
+void ReduceFp32KeepDimsTest(tflite::BuiltinOperator reduceOperatorCode,
+                            std::vector<armnn::BackendId>& backends,
+                            std::vector<float>& expectedOutputValues)
+{
+    // Reduces a { 1, 1, 2, 3 } float tensor over axis 2 with keepDims set, so the output stays 4D.
+    std::vector<int32_t> dataShape   = { 1, 1, 2, 3 };
+    std::vector<int32_t> axisShape   = { 1 };
+    std::vector<int32_t> resultShape = { 1, 1, 1, 3 };
+    std::vector<int32_t> axisValues  = { 2 };
+    std::vector<float>   dataValues  = { 1001.0f, 11.0f,   1003.0f,
+                                         10.0f,   1002.0f, 12.0f };
+
+    ReduceTest<float>(reduceOperatorCode,
+                      ::tflite::TensorType_FLOAT32,
+                      backends,
+                      dataShape,
+                      axisShape,
+                      resultShape,
+                      dataValues,
+                      axisValues,
+                      expectedOutputValues,
+                      true);
+}
+
+void ReduceFp32Test(tflite::BuiltinOperator reduceOperatorCode,
+                    std::vector<armnn::BackendId>& backends,
+                    std::vector<float>& expectedOutputValues)
+{
+    // Reduces a { 1, 1, 2, 3 } float tensor over axis 2 with keepDims cleared, so the output is 3D.
+    std::vector<int32_t> dataShape   = { 1, 1, 2, 3 };
+    std::vector<int32_t> axisShape   = { 1 };
+    std::vector<int32_t> resultShape = { 1, 1, 3 };
+    std::vector<int32_t> axisValues  = { 2 };
+    std::vector<float>   dataValues  = { 1001.0f, 11.0f,   1003.0f,
+                                         10.0f,   1002.0f, 12.0f };
+
+    ReduceTest<float>(reduceOperatorCode,
+                      ::tflite::TensorType_FLOAT32,
+                      backends,
+                      dataShape,
+                      axisShape,
+                      resultShape,
+                      dataValues,
+                      axisValues,
+                      expectedOutputValues,
+                      false);
+}
+
+// REDUCE_MAX Tests
+TEST_SUITE("ReduceMax_CpuRefTests")
+{
+
+TEST_CASE("ReduceMax_Uint8_KeepDims_CpuRef_Test")
+{
+    // Largest value of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Uint8_CpuRef_Test")
+{
+    // Largest value of each uint8 column, reduced axis dropped.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceUint8Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Fp32_KeepDims_CpuRef_Test")
+{
+    // Largest value of each float column, reduced axis kept.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceFp32KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Fp32_CpuRef_Test")
+{
+    // Largest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMax_CpuRefTests
+
+TEST_SUITE("ReduceMax_CpuAccTests")
+{
+
+TEST_CASE("ReduceMax_Uint8_KeepDims_CpuAcc_Test")
+{
+    // Largest value of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Uint8_CpuAcc_Test")
+{
+    // Largest value of each uint8 column, reduced axis dropped.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceUint8Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+
+TEST_CASE("ReduceMax_Fp32_KeepDims_CpuAcc_Test")
+{
+    // Largest value of each float column, reduced axis kept.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceFp32KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Fp32_CpuAcc_Test")
+{
+    // Largest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMax_CpuAccTests
+
+TEST_SUITE("ReduceMax_GpuAccTests")
+{
+
+TEST_CASE("ReduceMax_Uint8_KeepDims_GpuAcc_Test")
+{
+    // Largest value of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Uint8_GpuAcc_Test")
+{
+    // Largest value of each uint8 column, reduced axis dropped.
+    std::vector<uint8_t> expected { 4, 3, 3 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceUint8Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+
+TEST_CASE("ReduceMax_Fp32_KeepDims_GpuAcc_Test")
+{
+    // Largest value of each float column, reduced axis kept.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceFp32KeepDimsTest(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+TEST_CASE("ReduceMax_Fp32_GpuAcc_Test")
+{
+    // Largest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 1001.0f, 1002.0f, 1003.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MAX, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMax_GpuAccTests
+
+// REDUCE_MIN Tests
+TEST_SUITE("ReduceMin_CpuRefTests")
+{
+
+TEST_CASE("ReduceMin_Fp32_CpuRef_Test")
+{
+    // Smallest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 10.0f, 11.0f, 12.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MIN, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMin_CpuRefTests
+
+TEST_SUITE("ReduceMin_CpuAccTests")
+{
+
+TEST_CASE("ReduceMin_Fp32_CpuAcc_Test")
+{
+    // Smallest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 10.0f, 11.0f, 12.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MIN, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMin_CpuAccTests
+
+TEST_SUITE("ReduceMin_GpuAccTests")
+{
+
+TEST_CASE("ReduceMin_Fp32_GpuAcc_Test")
+{
+    // Smallest value of each float column, reduced axis dropped.
+    std::vector<float> expected { 10.0f, 11.0f, 12.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_REDUCE_MIN, backendIds, expected);
+}
+
+} // TEST_SUITE ReduceMin_GpuAccTests
+
+// SUM Tests
+TEST_SUITE("Sum_CpuRefTests")
+{
+
+TEST_CASE("Sum_Uint8_KeepDims_CpuRef_Test")
+{
+    // Sum of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 5, 5, 4 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+TEST_CASE("Sum_Fp32_CpuRef_Test")
+{
+    // Sum of each float column, reduced axis dropped.
+    std::vector<float> expected { 1011.0f, 1013.0f, 1015.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuRef };
+
+    ReduceFp32Test(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+} // TEST_SUITE Sum_CpuRefTests
+
+TEST_SUITE("Sum_CpuAccTests")
+{
+
+TEST_CASE("Sum_Uint8_KeepDims_CpuAcc_Test")
+{
+    // Sum of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 5, 5, 4 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+TEST_CASE("Sum_Fp32_CpuAcc_Test")
+{
+    // Sum of each float column, reduced axis dropped.
+    std::vector<float> expected { 1011.0f, 1013.0f, 1015.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::CpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+} // TEST_SUITE Sum_CpuAccTests
+
+TEST_SUITE("Sum_GpuAccTests")
+{
+
+TEST_CASE("Sum_Uint8_KeepDims_GpuAcc_Test")
+{
+    // Sum of each uint8 column, reduced axis kept.
+    std::vector<uint8_t> expected { 5, 5, 4 };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceUint8KeepDimsTest(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+TEST_CASE("Sum_Fp32_GpuAcc_Test")
+{
+    // Sum of each float column, reduced axis dropped.
+    std::vector<float> expected { 1011.0f, 1013.0f, 1015.0f };
+    std::vector<armnn::BackendId> backendIds { armnn::Compute::GpuAcc };
+
+    ReduceFp32Test(tflite::BuiltinOperator_SUM, backendIds, expected);
+}
+
+} // TEST_SUITE Sum_GpuAccTests
+
+
+} // namespace armnnDelegate
\ No newline at end of file
diff --git a/delegate/src/test/ReduceTestHelper.hpp b/delegate/src/test/ReduceTestHelper.hpp
new file mode 100644
index 0000000..b41fcfa
--- /dev/null
+++ b/delegate/src/test/ReduceTestHelper.hpp
@@ -0,0 +1,186 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "TestUtils.hpp"
+
+#include <armnn_delegate.hpp>
+
+#include <flatbuffers/flatbuffers.h>
+#include <tensorflow/lite/interpreter.h>
+#include <tensorflow/lite/kernels/register.h>
+#include <tensorflow/lite/model.h>
+#include <tensorflow/lite/schema/schema_generated.h>
+#include <tensorflow/lite/version.h>
+
+#include <doctest/doctest.h>
+
+#include <string>
+
+namespace
+{
+// Builds and serializes a TfLite flatbuffer model holding a single reduce operator (data and axis in, one out).
+std::vector<char> CreateReduceTfLiteModel(tflite::BuiltinOperator reduceOperatorCode,
+                                        tflite::TensorType tensorType,
+                                        std::vector<int32_t>& input0TensorShape,
+                                        std::vector<int32_t>& input1TensorShape,
+                                        const std::vector <int32_t>& outputTensorShape,
+                                        std::vector<int32_t>& axisData, // copied byte-wise into the model's second buffer
+                                        const bool keepDims, // forwarded to the operator's ReducerOptions
+                                        float quantScale = 1.0f, // scale/offset are shared by all three tensors
+                                        int quantOffset  = 0)
+{
+    using namespace tflite;
+    flatbuffers::FlatBufferBuilder flatBufferBuilder;
+    // Two buffers: an empty one and one holding the raw bytes of axisData.
+    std::array<flatbuffers::Offset<tflite::Buffer>, 2> buffers;
+    buffers[0] = CreateBuffer(flatBufferBuilder, flatBufferBuilder.CreateVector({}));
+    buffers[1] = CreateBuffer(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector(reinterpret_cast<const uint8_t*>(axisData.data()),
+                                                             sizeof(int32_t) * axisData.size()));
+
+    auto quantizationParameters =
+            CreateQuantizationParameters(flatBufferBuilder,
+                                         0,
+                                         0,
+                                         flatBufferBuilder.CreateVector<float>({ quantScale }),
+                                         flatBufferBuilder.CreateVector<int64_t>({ quantOffset }));
+    // Tensor 0 is the data input, tensor 1 the axis (always INT32), tensor 2 the output.
+    std::array<flatbuffers::Offset<Tensor>, 3> tensors;
+    tensors[0] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(input0TensorShape.data(),
+                                                                      input0TensorShape.size()),
+                              tensorType,
+                              0, // presumably the buffer index (the empty buffer) - confirm against the schema
+                              flatBufferBuilder.CreateString("input"),
+                              quantizationParameters);
+
+    tensors[1] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(input1TensorShape.data(),
+                                                                      input1TensorShape.size()),
+                              ::tflite::TensorType_INT32,
+                              1, // presumably the buffer index (the axisData buffer) - confirm against the schema
+                              flatBufferBuilder.CreateString("axis"),
+                              quantizationParameters);
+
+    // Create output tensor
+    tensors[2] = CreateTensor(flatBufferBuilder,
+                              flatBufferBuilder.CreateVector<int32_t>(outputTensorShape.data(),
+                                                                      outputTensorShape.size()),
+                              tensorType,
+                              0,
+                              flatBufferBuilder.CreateString("output"),
+                              quantizationParameters);
+
+    // Create operator. Reduce operations MIN, MAX, SUM, MEAN use ReducerOptions.
+    tflite::BuiltinOptions operatorBuiltinOptionsType = tflite::BuiltinOptions_ReducerOptions;
+    flatbuffers::Offset<void> operatorBuiltinOptions = CreateReducerOptions(flatBufferBuilder, keepDims).Union();
+
+    const std::vector<int> operatorInputs{ {0, 1} };
+    const std::vector<int> operatorOutputs{ 2 };
+    flatbuffers::Offset <Operator> reduceOperator =
+            CreateOperator(flatBufferBuilder,
+                           0,
+                           flatBufferBuilder.CreateVector<int32_t>(operatorInputs.data(), operatorInputs.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(operatorOutputs.data(), operatorOutputs.size()),
+                           operatorBuiltinOptionsType,
+                           operatorBuiltinOptions);
+    // The subgraph lists tensors 0 and 1 as its inputs and tensor 2 as its output.
+    const std::vector<int> subgraphInputs{ {0, 1} };
+    const std::vector<int> subgraphOutputs{ 2 };
+    flatbuffers::Offset <SubGraph> subgraph =
+            CreateSubGraph(flatBufferBuilder,
+                           flatBufferBuilder.CreateVector(tensors.data(), tensors.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(subgraphInputs.data(), subgraphInputs.size()),
+                           flatBufferBuilder.CreateVector<int32_t>(subgraphOutputs.data(), subgraphOutputs.size()),
+                           flatBufferBuilder.CreateVector(&reduceOperator, 1));
+
+    flatbuffers::Offset <flatbuffers::String> modelDescription =
+            flatBufferBuilder.CreateString("ArmnnDelegate: Reduce Operator Model");
+    flatbuffers::Offset <OperatorCode> operatorCode = CreateOperatorCode(flatBufferBuilder, reduceOperatorCode);
+
+    flatbuffers::Offset <Model> flatbufferModel =
+            CreateModel(flatBufferBuilder,
+                        TFLITE_SCHEMA_VERSION,
+                        flatBufferBuilder.CreateVector(&operatorCode, 1),
+                        flatBufferBuilder.CreateVector(&subgraph, 1),
+                        modelDescription,
+                        flatBufferBuilder.CreateVector(buffers.data(), buffers.size()));
+
+    flatBufferBuilder.Finish(flatbufferModel);
+    // Return a copy of the finished buffer's bytes.
+    return std::vector<char>(flatBufferBuilder.GetBufferPointer(),
+                             flatBufferBuilder.GetBufferPointer() + flatBufferBuilder.GetSize());
+}
+
+template <typename T>
+void ReduceTest(tflite::BuiltinOperator reduceOperatorCode,
+                tflite::TensorType tensorType,
+                std::vector<armnn::BackendId>& backends,
+                std::vector<int32_t>& input0Shape,
+                std::vector<int32_t>& input1Shape,
+                std::vector<int32_t>& expectedOutputShape,
+                std::vector<T>& input0Values,
+                std::vector<int32_t>& input1Values,
+                std::vector<T>& expectedOutputValues,
+                const bool keepDims,
+                float quantScale = 1.0f,
+                int quantOffset  = 0)
+{
+    // Runs one reduce model through a plain interpreter and an ArmNN-delegated one and checks the outputs.
+    std::vector<char> serializedModel = CreateReduceTfLiteModel(reduceOperatorCode,
+                                                                tensorType,
+                                                                input0Shape,
+                                                                input1Shape,
+                                                                expectedOutputShape,
+                                                                input1Values,
+                                                                keepDims,
+                                                                quantScale,
+                                                                quantOffset);
+    const tflite::Model* model = tflite::GetModel(serializedModel.data());
+
+    // Interpreter that will be handed over to the ArmNN delegate
+    std::unique_ptr<tflite::Interpreter> delegatedInterpreter;
+    CHECK(tflite::InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())
+                  (&delegatedInterpreter) == kTfLiteOk);
+    CHECK(delegatedInterpreter != nullptr);
+    CHECK(delegatedInterpreter->AllocateTensors() == kTfLiteOk);
+
+    // Interpreter that is left without a delegate
+    std::unique_ptr<tflite::Interpreter> referenceInterpreter;
+    CHECK(tflite::InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())
+                  (&referenceInterpreter) == kTfLiteOk);
+    CHECK(referenceInterpreter != nullptr);
+    CHECK(referenceInterpreter->AllocateTensors() == kTfLiteOk);
+
+    // Build the delegate for the requested backends; the custom deleter releases it on scope exit
+    armnnDelegate::DelegateOptions options(backends);
+    std::unique_ptr<TfLiteDelegate, decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
+            delegate(armnnDelegate::TfLiteArmnnDelegateCreate(options),
+                     armnnDelegate::TfLiteArmnnDelegateDelete);
+    CHECK(delegate != nullptr);
+
+    // Hand the first interpreter's graph over to the delegate
+    CHECK(delegatedInterpreter->ModifyGraphWithDelegate(delegate.get()) == kTfLiteOk);
+
+    // Feed the same data tensor to both interpreters
+    armnnDelegate::FillInput<T>(referenceInterpreter, 0, input0Values);
+    armnnDelegate::FillInput<T>(delegatedInterpreter, 0, input0Values);
+
+    // Execute both interpreters
+    CHECK(referenceInterpreter->Invoke() == kTfLiteOk);
+    CHECK(delegatedInterpreter->Invoke() == kTfLiteOk);
+
+    // Pass both interpreters plus the expected shape and values to the comparison helper
+    armnnDelegate::CompareOutputData<T>(referenceInterpreter,
+                                        delegatedInterpreter,
+                                        expectedOutputShape,
+                                        expectedOutputValues);
+
+    delegatedInterpreter.reset(nullptr);
+}
+
+} // anonymous namespace
\ No newline at end of file
diff --git a/include/armnn/BackendHelper.hpp b/include/armnn/BackendHelper.hpp
index 3d0632d..a562f60 100644
--- a/include/armnn/BackendHelper.hpp
+++ b/include/armnn/BackendHelper.hpp
@@ -325,6 +325,11 @@
                          const TensorInfo& output,
                          Optional<std::string&> reasonIfUnsupported = EmptyOptional());
 
+    bool IsReduceSupported(const TensorInfo& input,
+                           const TensorInfo& output,
+                           const ReduceDescriptor& descriptor,
+                           Optional<std::string&> reasonIfUnsupported = EmptyOptional());
+
     bool IsReshapeSupported(const TensorInfo& input,
                             const TensorInfo& output,
                             const ReshapeDescriptor& descriptor,
diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp
index fb74877..1467366 100644
--- a/src/armnn/BackendHelper.cpp
+++ b/src/armnn/BackendHelper.cpp
@@ -568,6 +568,14 @@
     return m_LayerSupport->IsRankSupported(input, output, reasonIfUnsupported.value());
 }
 
+bool LayerSupportHandle::IsReduceSupported(const TensorInfo& input,
+                                           const TensorInfo& output,
+                                           const ReduceDescriptor& descriptor,
+                                           Optional<std::string&> reasonIfUnsupported)
+{   // Pass the Optional through untouched: the default EmptyOptional() has no value to unwrap.
+    return m_LayerSupport->IsReduceSupported(input, output, descriptor, reasonIfUnsupported);
+}
+
 bool LayerSupportHandle::IsReshapeSupported(const TensorInfo& input,
                                             const TensorInfo& output,
                                             const ReshapeDescriptor& descriptor,
diff --git a/src/armnnTfLiteParser/TfLiteParser.cpp b/src/armnnTfLiteParser/TfLiteParser.cpp
index 1b91576..8ce1667 100644
--- a/src/armnnTfLiteParser/TfLiteParser.cpp
+++ b/src/armnnTfLiteParser/TfLiteParser.cpp
@@ -631,6 +631,8 @@
     m_ParserFunctions[tflite::BuiltinOperator_QUANTIZE]                = &TfLiteParserImpl::ParseQuantize;
     m_ParserFunctions[tflite::BuiltinOperator_RELU]                    = &TfLiteParserImpl::ParseRelu;
     m_ParserFunctions[tflite::BuiltinOperator_RELU6]                   = &TfLiteParserImpl::ParseRelu6;
+    m_ParserFunctions[tflite::BuiltinOperator_REDUCE_MAX]              = &TfLiteParserImpl::ParseReduceMax;
+    m_ParserFunctions[tflite::BuiltinOperator_REDUCE_MIN]              = &TfLiteParserImpl::ParseReduceMin;
     m_ParserFunctions[tflite::BuiltinOperator_RESHAPE]                 = &TfLiteParserImpl::ParseReshape;
     m_ParserFunctions[tflite::BuiltinOperator_RESIZE_BILINEAR]         = &TfLiteParserImpl::ParseResizeBilinear;
     m_ParserFunctions[tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR] = &TfLiteParserImpl::ParseResizeNearestNeighbor;
@@ -3059,6 +3061,21 @@
 
 void TfLiteParserImpl::ParseSum(size_t subgraphIndex, size_t operatorIndex)
 {
+    ParseReduce(subgraphIndex, operatorIndex, armnn::ReduceOperation::Sum); // SUM uses the shared reduce path
+}
+
+void TfLiteParserImpl::ParseReduceMax(size_t subgraphIndex, size_t operatorIndex)
+{
+    ParseReduce(subgraphIndex, operatorIndex, armnn::ReduceOperation::Max); // REDUCE_MAX uses the shared reduce path
+}
+
+void TfLiteParserImpl::ParseReduceMin(size_t subgraphIndex, size_t operatorIndex)
+{
+    ParseReduce(subgraphIndex, operatorIndex, armnn::ReduceOperation::Min); // REDUCE_MIN uses the shared reduce path
+}
+
+void TfLiteParserImpl::ParseReduce(size_t subgraphIndex, size_t operatorIndex, ReduceOperation reduceOperation)
+{
     CHECK_MODEL(m_Model, subgraphIndex, operatorIndex);
 
     const auto &operatorPtr = m_Model->subgraphs[subgraphIndex]->operators[operatorIndex];
@@ -3070,7 +3087,7 @@
     auto outputs = GetOutputs(m_Model, subgraphIndex, operatorIndex);
     CHECK_VALID_SIZE(outputs.size(), 1);
 
-    auto layerName = fmt::format("Sum:{}:{}", subgraphIndex, operatorIndex);
+    auto layerName = fmt::format("Reduce:{}:{}", subgraphIndex, operatorIndex);
 
     armnn::TensorInfo inputTensorInfo0 = ToTensorInfo(inputs[0]);
     armnn::TensorInfo inputTensorInfo1 = ToTensorInfo(inputs[1]);
@@ -3088,11 +3105,18 @@
                                                                axisBufferPtr->data.data()[i]));
         }
     }
+    else
+    {
+        for (uint32_t i = 0; i < inputTensorInfo0.GetNumDimensions(); ++i)
+        {
+            desc.m_vAxis.push_back(i);
+        }
+    }
 
     desc.m_TargetHeight    = input0Shape[1];
     desc.m_TargetWidth     = input0Shape[2];
     desc.m_KeepDims        = options->keep_dims;
-    desc.m_ReduceOperation = armnn::ReduceOperation::Sum;
+    desc.m_ReduceOperation = reduceOperation;
 
     // Register a new layer object, Sum.
     IConnectableLayer *layer = m_Network->AddReduceLayer(desc, layerName.c_str());
diff --git a/src/armnnTfLiteParser/TfLiteParser.hpp b/src/armnnTfLiteParser/TfLiteParser.hpp
index 2603d90..b59571e 100644
--- a/src/armnnTfLiteParser/TfLiteParser.hpp
+++ b/src/armnnTfLiteParser/TfLiteParser.hpp
@@ -124,6 +124,9 @@
     void ParsePad(size_t subgraphIndex, size_t operatorIndex);
     void ParsePool(size_t subgraphIndex, size_t operatorIndex, armnn::PoolingAlgorithm algorithm);
     void ParseQuantize(size_t subgraphIndex, size_t operatorIndex);
+    void ParseReduce(size_t subgraphIndex, size_t operatorIndex, armnn::ReduceOperation reduceOperation);
+    void ParseReduceMax(size_t subgraphIndex, size_t operatorIndex);
+    void ParseReduceMin(size_t subgraphIndex, size_t operatorIndex);
     void ParseRelu(size_t subgraphIndex, size_t operatorIndex);
     void ParseRelu6(size_t subgraphIndex, size_t operatorIndex);
     void ParseReshape(size_t subgraphIndex, size_t operatorIndex);
diff --git a/src/armnnTfLiteParser/test/Reduce.cpp b/src/armnnTfLiteParser/test/Reduce.cpp
new file mode 100644
index 0000000..622d54e
--- /dev/null
+++ b/src/armnnTfLiteParser/test/Reduce.cpp
@@ -0,0 +1,193 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <boost/test/unit_test.hpp>
+#include "ParserFlatbuffersFixture.hpp"
+#include "../TfLiteParser.hpp"
+
+#include <string>
+#include <iostream>
+
+BOOST_AUTO_TEST_SUITE(TensorflowLiteParser)
+
+struct ReduceMaxFixture : public ParserFlatbuffersFixture // Single-operator REDUCE_MAX model described in JSON
+{
+    explicit ReduceMaxFixture(const std::string& inputShape,
+                              const std::string& outputShape,
+                              const std::string& axisShape,
+                              const std::string& axisData)
+    {
+        m_JsonString = R"(
+            {
+                "version": 3,
+                "operator_codes": [ { "builtin_code": "REDUCE_MAX" } ],
+                "subgraphs": [ {
+                    "tensors": [
+                        {
+                            "shape": )" + inputShape + R"(,
+                            "type": "FLOAT32",
+                            "buffer": 0,
+                            "name": "inputTensor",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        },
+                        {
+                            "shape": )" + outputShape + R"( ,
+                            "type": "FLOAT32",
+                            "buffer": 1,
+                            "name": "outputTensor",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        },
+                        {
+                            "shape": )" + axisShape + R"( ,
+                            "type": "INT32",
+                            "buffer": 2,
+                            "name": "axis",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        }
+                    ],
+                    "inputs": [ 0 ],
+                    "outputs": [ 1 ],
+                    "operators": [
+                        {
+                            "opcode_index": 0,
+                            "inputs": [ 0 , 2 ],
+                            "outputs": [ 1 ],
+                            "builtin_options_type": "ReducerOptions",
+                            "builtin_options": {
+                              "keep_dims": true,
+                            },
+                            "custom_options_format": "FLEXBUFFERS"
+                        }
+                    ],
+                } ],
+                "buffers" : [
+                    { },
+                    { },
+                    { "data": )" + axisData + R"(, },
+                ]
+            }
+        )";
+        SetupSingleInputSingleOutput("inputTensor", "outputTensor"); // names match the tensors declared in the JSON
+    }
+};
+
+struct SimpleReduceMaxFixture : public ReduceMaxFixture // [1,1,2,3] input, axis 2, [1,1,1,3] output
+{
+    SimpleReduceMaxFixture() : ReduceMaxFixture("[ 1, 1, 2, 3 ]", "[ 1, 1, 1, 3 ]", "[ 1 ]", "[ 2 ]") {}
+};
+
+BOOST_FIXTURE_TEST_CASE(ParseReduceMax, SimpleReduceMaxFixture)
+{   // The expected output holds the larger value of each column of the 2x3 input.
+    RunTest<4, armnn::DataType::Float32, armnn::DataType::Float32>
+        (0, {{ "inputTensor",  { 1001.0f, 11.0f,   1003.0f,
+                                 10.0f,   1002.0f, 12.0f } } },
+            {{ "outputTensor", { 1001.0f, 1002.0f, 1003.0f } } });
+}
+
+struct ReduceMinFixture : public ParserFlatbuffersFixture // Single-operator REDUCE_MIN model described in JSON
+{
+    explicit ReduceMinFixture(const std::string& inputShape,
+                              const std::string& outputShape,
+                              const std::string& axisShape,
+                              const std::string& axisData)
+    {
+        m_JsonString = R"(
+            {
+                "version": 3,
+                "operator_codes": [ { "builtin_code": "REDUCE_MIN" } ],
+                "subgraphs": [ {
+                    "tensors": [
+                        {
+                            "shape": )" + inputShape + R"(,
+                            "type": "FLOAT32",
+                            "buffer": 0,
+                            "name": "inputTensor",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        },
+                        {
+                            "shape": )" + outputShape + R"( ,
+                            "type": "FLOAT32",
+                            "buffer": 1,
+                            "name": "outputTensor",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        },
+                        {
+                            "shape": )" + axisShape + R"( ,
+                            "type": "INT32",
+                            "buffer": 2,
+                            "name": "axis",
+                            "quantization": {
+                                "min": [ 0.0 ],
+                                "max": [ 255.0 ],
+                                "scale": [ 1.0 ],
+                                "zero_point": [ 0 ],
+                            }
+                        }
+                    ],
+                    "inputs": [ 0 ],
+                    "outputs": [ 1 ],
+                    "operators": [
+                        {
+                            "opcode_index": 0,
+                            "inputs": [ 0 , 2 ],
+                            "outputs": [ 1 ],
+                            "builtin_options_type": "ReducerOptions",
+                            "builtin_options": {
+                              "keep_dims": true,
+                            },
+                            "custom_options_format": "FLEXBUFFERS"
+                        }
+                    ],
+                } ],
+                "buffers" : [
+                    { },
+                    { },
+                    { "data": )" + axisData + R"(, },
+                ]
+            }
+        )";
+        SetupSingleInputSingleOutput("inputTensor", "outputTensor"); // names match the tensors declared in the JSON
+    }
+};
+
+struct SimpleReduceMinFixture : public ReduceMinFixture // [1,1,2,3] input, axis 2, [1,1,1,3] output
+{
+    SimpleReduceMinFixture() : ReduceMinFixture("[ 1, 1, 2, 3 ]", "[ 1, 1, 1, 3 ]", "[ 1 ]", "[ 2 ]") {}
+};
+
+BOOST_FIXTURE_TEST_CASE(ParseReduceMin, SimpleReduceMinFixture)
+{   // The expected output holds the smaller value of each column of the 2x3 input.
+    RunTest<4, armnn::DataType::Float32, armnn::DataType::Float32>
+        (0, {{ "inputTensor",  { 1001.0f, 11.0f,   1003.0f,
+                                 10.0f,   1002.0f, 12.0f } } },
+            {{ "outputTensor", { 10.0f, 11.0f, 12.0f } } });
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp
index 2a07078..d9efab2 100644
--- a/src/backends/aclCommon/ArmComputeUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeUtils.hpp
@@ -255,4 +255,16 @@
     return static_cast<unsigned int>(positiveAxis);
 }
 
+inline arm_compute::ReductionOperation ConvertReductionOperationToAcl(const ReduceDescriptor& descriptor)
+{
+    // Translate the descriptor's reduce operation into its ACL counterpart; anything unmapped is rejected.
+    using AclOperation = arm_compute::ReductionOperation;
+    const ReduceOperation operation = descriptor.m_ReduceOperation;
+    if (operation == ReduceOperation::Sum)  { return AclOperation::SUM; }
+    if (operation == ReduceOperation::Mean) { return AclOperation::MEAN_SUM; }
+    if (operation == ReduceOperation::Max)  { return AclOperation::MAX; }
+    if (operation == ReduceOperation::Min)  { return AclOperation::MIN; }
+    throw InvalidArgumentException("Unsupported Reduction operation");
+}
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 3b6299d..54c7916 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -75,6 +75,7 @@
     test/layerTests/PadTestImpl.cpp \
     test/layerTests/Pooling2dTestImpl.cpp \
     test/layerTests/RankTestImpl.cpp \
+    test/layerTests/ReductionTestImpl.cpp \
     test/layerTests/ReduceSumTestImpl.cpp \
     test/layerTests/ReshapeTestImpl.cpp \
     test/layerTests/ResizeTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index b20ef2d..f92e074 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -137,6 +137,8 @@
     layerTests/QuantizeTestImpl.hpp
     layerTests/RankTestImpl.cpp
     layerTests/RankTestImpl.hpp
+    layerTests/ReductionTestImpl.cpp
+    layerTests/ReductionTestImpl.hpp
     layerTests/ReduceSumTestImpl.cpp
     layerTests/ReduceSumTestImpl.hpp
     layerTests/ReshapeTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index d87a3b0..a7dcb99 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -48,6 +48,7 @@
 #include <backendsCommon/test/layerTests/PreluTestImpl.hpp>
 #include <backendsCommon/test/layerTests/QuantizeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/RankTestImpl.hpp>
+#include <backendsCommon/test/layerTests/ReductionTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ReduceSumTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ReshapeTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ResizeTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp
index 4edbd11..18821b9 100644
--- a/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/ReduceSumTestImpl.cpp
@@ -24,7 +24,8 @@
         const std::vector<float>& inputData,
         const std::vector<float>& outputData,
         const std::vector<int32_t> vAxis,
-        const armnn::ReduceOperation reduceOperation)
+        const armnn::ReduceOperation reduceOperation,
+        bool keepDims = false)
 {
     IgnoreUnused(memoryManager);
     auto inputTensor = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
@@ -53,6 +54,7 @@
 
     descriptor.m_Parameters.m_vAxis = updated_idx;
     descriptor.m_Parameters.m_ReduceOperation = reduceOperation;
+    descriptor.m_Parameters.m_KeepDims = keepDims;
     armnn::WorkloadInfo info;
 
     AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
@@ -268,7 +270,8 @@
                                        inputValues,
                                        outputValues,
                                        { 3 },
-                                       armnn::ReduceOperation::Sum);
+                                       armnn::ReduceOperation::Sum,
+                                       true);
 }
 
 template<armnn::DataType ArmnnType, typename T>
diff --git a/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.cpp
new file mode 100644
index 0000000..589cc03
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.cpp
@@ -0,0 +1,315 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ReductionTestImpl.hpp"
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <iostream>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReductionTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory,
+        const armnn::TensorInfo inputTensorInfo,
+        const armnn::TensorInfo outputTensorInfo,
+        const std::vector<float>& inputData,
+        const std::vector<float>& outputData,
+        const std::vector<int32_t> vAxis,
+        const armnn::ReduceOperation reduceOperation,
+        bool keepDims = false)
+{
+    // Shared driver for the Reduce layer tests: builds a Reduce workload from the supplied tensors and
+    // parameters, executes it once and returns the produced output next to the expected one.
+    IgnoreUnused(memoryManager);
+
+    // Negative axes count from the back, so map every axis onto its non-negative index first.
+    const uint32_t rank = inputTensorInfo.GetNumDimensions();
+    std::vector<uint32_t> resolvedAxes;
+    for (const int32_t axis : vAxis)
+    {
+        // For a negative axis the unsigned addition wraps around and yields rank - |axis|.
+        resolvedAxes.push_back(axis < 0 ? rank + static_cast<uint32_t>(axis) : static_cast<uint32_t>(axis));
+    }
+
+    armnn::ReduceQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_vAxis           = resolvedAxes;
+    queueDescriptor.m_Parameters.m_ReduceOperation = reduceOperation;
+    queueDescriptor.m_Parameters.m_KeepDims        = keepDims;
+
+    // The input is converted to the data type under test; the expected output is kept as float.
+    auto typedInput = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
+
+    LayerTestResult<float, 4> testResult(outputTensorInfo);
+    testResult.outputExpected = MakeTensor<float, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> reduceWorkload = workloadFactory.CreateReduce(queueDescriptor, workloadInfo);
+
+    // Allocate both handles before any data is copied in or out.
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    // Feed the input, run the workload and read the result back.
+    CopyDataToITensorHandle(inputHandle.get(), typedInput.origin());
+
+    reduceWorkload->Execute();
+
+    CopyDataFromITensorHandle(testResult.output.origin(), outputHandle.get());
+
+    return testResult;
+}
+
+} // namespace
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceMaxSimpleTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    // Max-reduces a [1, 1, 2, 3] input over axis 2; keepDims is left at its default (false).
+    armnn::TensorInfo inputTensorInfo(armnn::TensorShape{ 1, 1, 2, 3 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    // The output tensor is declared as Float32 regardless of ArmnnType.
+    armnn::TensorInfo outputTensorInfo(armnn::TensorShape{ 1, 1, 1, 3 }, armnn::DataType::Float32);
+
+    const std::vector<float> inputValues =
+    {
+        1001.0f, 11.0f,   1003.0f,
+        10.0f,   1002.0f, 12.0f
+    };
+
+    // Largest value of each of the three columns.
+    const std::vector<float> outputValues =
+    {
+        1001.0f, 1002.0f, 1003.0f
+    };
+
+    return ReductionTestCommon<ArmnnType>(workloadFactory,
+                                          memoryManager,
+                                          tensorHandleFactory,
+                                          inputTensorInfo,
+                                          outputTensorInfo,
+                                          inputValues,
+                                          outputValues,
+                                          { 2 },
+                                          armnn::ReduceOperation::Max);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceMaxNegativeAxisTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    // Max-reduces a [1, 1, 2, 3] input over axis -1 with keepDims enabled.
+    armnn::TensorInfo inputTensorInfo(armnn::TensorShape{ 1, 1, 2, 3 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    // The output tensor is declared as Float32 regardless of ArmnnType.
+    armnn::TensorInfo outputTensorInfo(armnn::TensorShape{ 1, 1, 2, 1 }, armnn::DataType::Float32);
+
+    const std::vector<float> inputValues =
+    {
+        1001.0f, 11.0f,   1003.0f,
+        10.0f,   1002.0f, 12.0f
+    };
+
+    // Largest value of each of the two rows.
+    const std::vector<float> outputValues =
+    {
+        1003.0f, 1002.0f
+    };
+
+    return ReductionTestCommon<ArmnnType>(workloadFactory,
+                                          memoryManager,
+                                          tensorHandleFactory,
+                                          inputTensorInfo,
+                                          outputTensorInfo,
+                                          inputValues,
+                                          outputValues,
+                                          { -1 },
+                                          armnn::ReduceOperation::Max,
+                                          true);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceMaxSimpleTest2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    // Max-reduces a [1, 1, 2, 3] input over axis 3 with keepDims enabled,
+    // using small unordered values so the maximum sits at a different position in each row.
+    armnn::TensorInfo inputTensorInfo(armnn::TensorShape{ 1, 1, 2, 3 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    // The output tensor is declared as Float32 regardless of ArmnnType.
+    armnn::TensorInfo outputTensorInfo(armnn::TensorShape{ 1, 1, 2, 1 }, armnn::DataType::Float32);
+
+    const std::vector<float> inputValues =
+    {
+        1.0f, 3.0f, 2.0f,
+        6.0f, 4.0f, 5.0f
+    };
+
+    // Largest value of each of the two rows.
+    const std::vector<float> outputValues =
+    {
+        3.0f, 6.0f
+    };
+
+    return ReductionTestCommon<ArmnnType>(workloadFactory,
+                                          memoryManager,
+                                          tensorHandleFactory,
+                                          inputTensorInfo,
+                                          outputTensorInfo,
+                                          inputValues,
+                                          outputValues,
+                                          { 3 },
+                                          armnn::ReduceOperation::Max,
+                                          true);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceMinSimpleTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    // Min-reduces a [1, 1, 2, 3] input over axis 2; keepDims is left at its default (false).
+    armnn::TensorInfo inputTensorInfo(armnn::TensorShape{ 1, 1, 2, 3 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    // The output tensor is declared as Float32 regardless of ArmnnType.
+    armnn::TensorInfo outputTensorInfo(armnn::TensorShape{ 1, 1, 1, 3 }, armnn::DataType::Float32);
+
+    const std::vector<float> inputValues =
+    {
+        1001.0f, 11.0f,   1003.0f,
+        10.0f,   1002.0f, 12.0f
+    };
+
+    // Smallest value of each of the three columns.
+    const std::vector<float> outputValues =
+    {
+        10.0f, 11.0f, 12.0f
+    };
+
+    return ReductionTestCommon<ArmnnType>(workloadFactory,
+                                          memoryManager,
+                                          tensorHandleFactory,
+                                          inputTensorInfo,
+                                          outputTensorInfo,
+                                          inputValues,
+                                          outputValues,
+                                          { 2 },
+                                          armnn::ReduceOperation::Min);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<float, 4> ReduceMinNegativeAxisTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    // Min-reduces a [1, 1, 2, 3] input over axis -1 with keepDims enabled.
+    armnn::TensorInfo inputTensorInfo(armnn::TensorShape{ 1, 1, 2, 3 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.0f);
+        inputTensorInfo.SetQuantizationOffset(0);
+    }
+
+    // The output tensor is declared as Float32 regardless of ArmnnType.
+    armnn::TensorInfo outputTensorInfo(armnn::TensorShape{ 1, 1, 2, 1 }, armnn::DataType::Float32);
+
+    const std::vector<float> inputValues =
+    {
+        1001.0f, 11.0f,   1003.0f,
+        10.0f,   1002.0f, 12.0f
+    };
+
+    // Smallest value of each of the two rows.
+    const std::vector<float> outputValues =
+    {
+        11.0f, 10.0f
+    };
+
+    return ReductionTestCommon<ArmnnType>(workloadFactory,
+                                          memoryManager,
+                                          tensorHandleFactory,
+                                          inputTensorInfo,
+                                          outputTensorInfo,
+                                          inputValues,
+                                          outputValues,
+                                          { -1 },
+                                          armnn::ReduceOperation::Min,
+                                          true);
+}
+
+// Explicit template instantiations
+template LayerTestResult<float, 4>
+ReduceMaxSimpleTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceMaxNegativeAxisTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceMaxSimpleTest2<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceMinSimpleTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<float, 4>
+ReduceMinNegativeAxisTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
diff --git a/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.hpp
new file mode 100644
index 0000000..495a74b
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReductionTestImpl.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceMaxSimpleTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceMaxNegativeAxisTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceMaxSimpleTest2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceMinSimpleTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<float, 4> ReduceMinNegativeAxisTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::ITensorHandleFactory& tensorHandleFactory);
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index 65454d4..0ab7930 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -60,6 +60,7 @@
 #include "workloads/ClQLstmWorkload.hpp"
 #include "workloads/ClQuantizedLstmWorkload.hpp"
 #include "workloads/ClQuantizeWorkload.hpp"
+#include "workloads/ClReduceWorkload.hpp"
 #include "workloads/ClReshapeWorkload.hpp"
 #include "workloads/ClResizeWorkload.hpp"
 #include "workloads/ClRsqrtWorkload.hpp"
@@ -798,6 +799,18 @@
                                    output);
 }
 
+bool ClLayerSupport::IsReduceSupported(const TensorInfo& input,
+                                       const TensorInfo& output,
+                                       const ReduceDescriptor& descriptor,
+                                       Optional<std::string&> reasonIfUnsupported) const
+{   // The support check is handed to ClReduceWorkloadValidate through the forwarding macro.
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClReduceWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output,
+                                   descriptor);
+}
+
 bool ClLayerSupport::IsReshapeSupported(const TensorInfo& input,
                                         const TensorInfo& output,
                                         const ReshapeDescriptor& descriptor,
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
index f2df94c..8b87391 100644
--- a/src/backends/cl/ClLayerSupport.hpp
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -253,6 +253,11 @@
                              const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsReduceSupported(const TensorInfo& input,
+                           const TensorInfo& output,
+                           const ReduceDescriptor& descriptor,
+                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsReshapeSupported(const TensorInfo& input,
                             const TensorInfo& output,
                             const ReshapeDescriptor& descriptor,
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index d65b263..ee6bcd3 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -575,6 +575,12 @@
     return std::make_unique<ClRankWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                                           const WorkloadInfo& info) const
+{
+    return std::make_unique<ClReduceWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const
 {
diff --git a/src/backends/cl/ClWorkloadFactory.hpp b/src/backends/cl/ClWorkloadFactory.hpp
index 66aea84..1d2c572 100644
--- a/src/backends/cl/ClWorkloadFactory.hpp
+++ b/src/backends/cl/ClWorkloadFactory.hpp
@@ -206,6 +206,9 @@
     std::unique_ptr<IWorkload> CreateRank(const RankQueueDescriptor& descriptor,
                                           const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                              const WorkloadInfo& info) const override;
 
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 9514750..9a83257 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -66,6 +66,7 @@
         workloads/ClQLstmWorkload.cpp \
         workloads/ClQuantizedLstmWorkload.cpp \
         workloads/ClQuantizeWorkload.cpp \
+        workloads/ClReduceWorkload.cpp \
         workloads/ClReshapeWorkload.cpp \
         workloads/ClResizeWorkload.cpp \
         workloads/ClRsqrtWorkload.cpp \
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index 018a62d..013965c 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -1271,6 +1271,21 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalOrBroadcast2, LogicalOrBroadcast2Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalOrBroadcast3, LogicalOrBroadcast3Test)
 
+// ReduceSum
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumFloat32, ReduceSumSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_1, ReduceSumSingleAxisTest1<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_2, ReduceSumSingleAxisTest2<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_3, ReduceSumSingleAxisTest3<DataType::Float32>)
+
+// ReduceMax
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxFloat32, ReduceMaxSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxNegativeAxisFloat32, ReduceMaxNegativeAxisTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMax2Float32, ReduceMaxSimpleTest2<DataType::Float32>)
+
+// ReduceMin
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinFloat32, ReduceMinSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinNegativeAxisFloat32, ReduceMinNegativeAxisTest<DataType::Float32>)
+
 #if defined(ARMNNREF_ENABLED)
 
 // The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
index 7427ea0..3a1b6b8 100644
--- a/src/backends/cl/workloads/CMakeLists.txt
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -87,6 +87,8 @@
     ClQuantizeWorkload.cpp
     ClQuantizeWorkload.hpp
     ClRankWorkload.hpp
+    ClReduceWorkload.cpp
+    ClReduceWorkload.hpp
     ClReshapeWorkload.cpp
     ClReshapeWorkload.hpp
     ClResizeWorkload.cpp
diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp
new file mode 100644
index 0000000..6f594ff
--- /dev/null
+++ b/src/backends/cl/workloads/ClReduceWorkload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClReduceWorkload.hpp"
+
+#include <cl/ClTensorHandle.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+// Reports whether the CL backend can run the reduction described by desc; a single axis is the limit.
+arm_compute::Status ClReduceWorkloadValidate(const TensorInfo& input,
+                                             const TensorInfo& output,
+                                             const ReduceDescriptor& desc)
+{
+    // Translate the ArmNN tensor descriptions into Compute Library tensor infos.
+    const arm_compute::TensorInfo aclInput  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    // Only the first reduction coordinate is forwarded below, so multi-axis requests are rejected here.
+    if (desc.m_vAxis.size() > 1)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "ClReduceWorkload: Reduction is supported only on 1 axis.");
+    }
+
+    arm_compute::Coordinates reductionAxes =
+        BuildArmComputeReductionCoordinates(aclInput.num_dimensions(), input.GetNumDimensions(), desc.m_vAxis);
+    const auto aclAxis = static_cast<unsigned int>(reductionAxes[0]);
+
+    return arm_compute::CLReductionOperation::validate(
+        &aclInput, &aclOutput, aclAxis, ConvertReductionOperationToAcl(desc), desc.m_KeepDims);
+}
+
+// Configures the CL reduction function from the queue descriptor; only the first reduction axis is used.
+ClReduceWorkload::ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
+    : BaseWorkload<ReduceQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClReduceWorkload", 1, 1);
+
+    const auto& params = m_Data.m_Parameters;
+    arm_compute::ICLTensor& aclInput  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& aclOutput = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    const auto inputRank = info.m_InputTensorInfos[0].GetNumDimensions();
+    arm_compute::Coordinates reductionAxes =
+        BuildArmComputeReductionCoordinates(aclInput.info()->num_dimensions(), inputRank, params.m_vAxis);
+    const auto aclAxis = static_cast<unsigned int>(reductionAxes[0]);
+
+    m_Layer.configure(&aclInput, &aclOutput, aclAxis, ConvertReductionOperationToAcl(params), params.m_KeepDims);
+}
+
+void ClReduceWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClReduceWorkload_Execute");
+    m_Layer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClReduceWorkload.hpp b/src/backends/cl/workloads/ClReduceWorkload.hpp
new file mode 100644
index 0000000..8481eee
--- /dev/null
+++ b/src/backends/cl/workloads/ClReduceWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/CL/functions/CLReductionOperation.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClReduceWorkloadValidate(const TensorInfo& input,
+                                             const TensorInfo& output,
+                                             const ReduceDescriptor& desc);
+
+class ClReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
+{
+public:
+    ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+    void Execute() const override;
+
+private:
+    mutable arm_compute::CLReductionOperation m_Layer;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
index 0045e7a..f99a9fa 100644
--- a/src/backends/cl/workloads/ClWorkloads.hpp
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -44,6 +44,7 @@
 #include "ClQuantizeWorkload.hpp"
 #include "ClQuantizedLstmWorkload.hpp"
 #include "ClRankWorkload.hpp"
+#include "ClReduceWorkload.hpp"
 #include "ClReshapeWorkload.hpp"
 #include "ClResizeWorkload.hpp"
 #include "ClRsqrtWorkload.hpp"
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 2d22576..66999c1 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -58,6 +58,7 @@
 #include "workloads/NeonQLstmWorkload.hpp"
 #include "workloads/NeonQuantizeWorkload.hpp"
 #include "workloads/NeonQuantizedLstmWorkload.hpp"
+#include "workloads/NeonReduceWorkload.hpp"
 #include "workloads/NeonReshapeWorkload.hpp"
 #include "workloads/NeonResizeWorkload.hpp"
 #include "workloads/NeonRsqrtWorkload.hpp"
@@ -784,6 +785,18 @@
                                    paramsInfo);
 }
 
+bool NeonLayerSupport::IsReduceSupported(const TensorInfo& input,
+                                         const TensorInfo& output,
+                                         const ReduceDescriptor& descriptor,
+                                         Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonReduceWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input,
+                                   output,
+                                   descriptor);
+}
+
 bool NeonLayerSupport::IsReshapeSupported(const TensorInfo& input,
                                           const TensorInfo& output,
                                           const ReshapeDescriptor& descriptor,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index dc13cc2..2ae1b0d 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -263,6 +263,11 @@
                                   const QuantizedLstmInputParamsInfo& paramsInfo,
                                   Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsReduceSupported(const TensorInfo& input,
+                           const TensorInfo& output,
+                           const ReduceDescriptor& descriptor,
+                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsReshapeSupported(const TensorInfo& input,
                             const TensorInfo& output,
                             const ReshapeDescriptor& descriptor,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 0d36110..7d09428 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -497,6 +497,12 @@
     return std::make_unique<NeonRankWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                                             const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonReduceWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                                               const WorkloadInfo& info) const
 {
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index 745dece..4817a06 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -207,6 +207,9 @@
     std::unique_ptr<IWorkload> CreateRank(const RankQueueDescriptor& descriptor,
                                           const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateReduce(const ReduceQueueDescriptor& descriptor,
+                                            const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
                                              const WorkloadInfo& info) const override;
 
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 54560cb..6feeeb5 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -66,6 +66,7 @@
         workloads/NeonQLstmWorkload.cpp \
         workloads/NeonQuantizedLstmWorkload.cpp \
         workloads/NeonQuantizeWorkload.cpp \
+        workloads/NeonReduceWorkload.cpp \
         workloads/NeonReshapeWorkload.cpp \
         workloads/NeonResizeWorkload.cpp \
         workloads/NeonRsqrtWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index d351870..8434a67 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -1372,6 +1372,21 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalOrBroadcast2, LogicalOrBroadcast2Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(LogicalOrBroadcast3, LogicalOrBroadcast3Test)
 
+// ReduceSum
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumFloat32, ReduceSumSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_1, ReduceSumSingleAxisTest1<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_2, ReduceSumSingleAxisTest2<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_3, ReduceSumSingleAxisTest3<DataType::Float32>)
+
+// ReduceMax
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxFloat32, ReduceMaxSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxNegativeAxisFloat32, ReduceMaxNegativeAxisTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMax2Float32, ReduceMaxSimpleTest2<DataType::Float32>)
+
+// ReduceMin
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinFloat32, ReduceMinSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinNegativeAxisFloat32, ReduceMinNegativeAxisTest<DataType::Float32>)
+
 #if defined(ARMNNREF_ENABLED)
 
 // The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index f1a723b..7c2b185 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -93,6 +93,8 @@
     NeonQuantizeWorkload.cpp
     NeonQuantizeWorkload.hpp
     NeonRankWorkload.hpp
+    NeonReduceWorkload.cpp
+    NeonReduceWorkload.hpp
     NeonReshapeWorkload.cpp
     NeonReshapeWorkload.hpp
     NeonResizeWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp
new file mode 100644
index 0000000..0e1b46a
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonReduceWorkload.hpp"
+
+#include <aclCommon/ArmComputeUtils.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <neon/NeonTensorHandle.hpp>
+
+#include "NeonWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+// Reports whether the Neon backend can run the reduction described by desc; a single axis is the limit.
+arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input,
+                                               const TensorInfo& output,
+                                               const ReduceDescriptor& desc)
+{
+    const arm_compute::TensorInfo aclInput  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    // Only the first reduction coordinate is forwarded below, so multi-axis requests are rejected here.
+    if (desc.m_vAxis.size() > 1)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "NeonReduceWorkload: Reduction is supported only on 1 axis.");
+    }
+
+    arm_compute::Coordinates reductionAxes =
+        BuildArmComputeReductionCoordinates(aclInput.num_dimensions(), input.GetNumDimensions(), desc.m_vAxis);
+    const auto aclAxis = static_cast<unsigned int>(reductionAxes[0]);
+
+    return arm_compute::NEReductionOperation::validate(
+        &aclInput, &aclOutput, aclAxis, ConvertReductionOperationToAcl(desc), desc.m_KeepDims);
+}
+
+// Configures the Neon reduction function from the queue descriptor; only the first reduction axis is used.
+NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
+    : BaseWorkload<ReduceQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("NeonReduceWorkload", 1, 1);
+
+    const auto& params = m_Data.m_Parameters;
+    arm_compute::ITensor& aclInput  = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& aclOutput = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    const auto inputRank = info.m_InputTensorInfos[0].GetNumDimensions();
+    arm_compute::Coordinates reductionAxes =
+        BuildArmComputeReductionCoordinates(aclInput.info()->num_dimensions(), inputRank, params.m_vAxis);
+    const auto aclAxis = static_cast<unsigned int>(reductionAxes[0]);
+
+    m_Layer.configure(&aclInput, &aclOutput, aclAxis, ConvertReductionOperationToAcl(params), params.m_KeepDims);
+}
+
+void NeonReduceWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReduceWorkload_Execute");
+    m_Layer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.hpp b/src/backends/neon/workloads/NeonReduceWorkload.hpp
new file mode 100644
index 0000000..0472091
--- /dev/null
+++ b/src/backends/neon/workloads/NeonReduceWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEReductionOperation.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonReduceWorkloadValidate(const TensorInfo& input,
+                                               const TensorInfo& output,
+                                               const ReduceDescriptor& desc);
+
+class NeonReduceWorkload : public BaseWorkload<ReduceQueueDescriptor>
+{
+public:
+    NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+    void Execute() const override;
+
+private:
+    mutable arm_compute::NEReductionOperation m_Layer;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 949100d..4eb526a 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -49,6 +49,7 @@
 #include "NeonQuantizedLstmWorkload.hpp"
 #include "NeonQuantizeWorkload.hpp"
 #include "NeonRankWorkload.hpp"
+#include "NeonReduceWorkload.hpp"
 #include "NeonReshapeWorkload.hpp"
 #include "NeonResizeWorkload.hpp"
 #include "NeonRsqrtWorkload.hpp"
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index d5e0f82..161476e 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -2241,4 +2241,13 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumSingleAxisFloat32_3, ReduceSumSingleAxisTest3<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceSumMultipleAxisFloat32, ReduceSumMultipleAxisTest<DataType::Float32>)
 
+// ReduceMax
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxFloat32, ReduceMaxSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMaxNegativeAxisFloat32, ReduceMaxNegativeAxisTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMax2Float32, ReduceMaxSimpleTest2<DataType::Float32>)
+
+// ReduceMin
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinFloat32, ReduceMinSimpleTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(ReduceMinNegativeAxisFloat32, ReduceMinNegativeAxisTest<DataType::Float32>)
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/Reduce.cpp b/src/backends/reference/workloads/Reduce.cpp
index 5375c71..31c6262 100644
--- a/src/backends/reference/workloads/Reduce.cpp
+++ b/src/backends/reference/workloads/Reduce.cpp
@@ -75,33 +75,27 @@
             const std::vector<uint32_t> axis,
             const ReduceOperation reduceOperation)
 {
-    unsigned int inputNumDims  = inputInfo.GetNumDimensions();
-    unsigned int outputNumDims = outputInfo.GetNumDimensions();
-
-    armnn::TensorShape outputDims = outputInfo.GetShape();
     armnn::TensorShape inputDims = inputInfo.GetShape();
+    unsigned int inputNumDims    = inputInfo.GetNumDimensions();
+    unsigned int numOutputs      = outputInfo.GetNumElements();
 
-    // Initialise output data.
-    unsigned int numOutputs = 1;
-    for (unsigned int idx = 0; idx < outputNumDims; ++idx)
+    // Initialise temp output with the identity of each operation (-inf for Max, +inf for Min, 0 otherwise)
+    std::vector<float> tempOut(numOutputs);
+    if (reduceOperation == ReduceOperation::Max)
     {
-        numOutputs *= outputDims[idx];
+        std::fill(tempOut.begin(), tempOut.end(), -std::numeric_limits<float>::infinity());
+    }
+    else if (reduceOperation == ReduceOperation::Min)
+    {
+        std::fill(tempOut.begin(), tempOut.end(), std::numeric_limits<float>::infinity());
+    }
+    else
+    {
+        std::fill(tempOut.begin(), tempOut.end(), 0.0);
     }
 
-    std::vector<float> tempSum(numOutputs);
-    for (unsigned int idx = 0; idx < numOutputs; ++idx)
-    {
-        output[idx];
-        output.Set(0.0f);
-        tempSum[idx] = 0.0f;
-    }
-
-    // Initialise temp index.
-    std::vector<unsigned int> tempIndex(inputNumDims);
-    for (unsigned int idx = 0; idx < inputNumDims; ++idx)
-    {
-        tempIndex[idx] = 0;
-    }
+    // Initialise temp index
+    std::vector<unsigned int> tempIndex(inputNumDims, 0);
 
     std::vector<unsigned int> resolvedAxis = axis;
     if (resolvedAxis.empty())
@@ -113,17 +107,35 @@
     }
     auto numResolvedAxis = armnn::numeric_cast<unsigned int>(resolvedAxis.size());
 
-    // Iterates through input_data and sum up the reduced axis.
+    // Iterates through input_data and operates over the reduced axis
     for (bool hasNext = true; hasNext; hasNext = NextIndex(inputNumDims, inputDims, tempIndex))
     {
         unsigned int inputOffset = ReducedOutputOffset(inputNumDims, inputDims, tempIndex, 0, {});
         unsigned int outputOffset = ReducedOutputOffset(inputNumDims, inputDims, tempIndex,
                                                         numResolvedAxis, resolvedAxis);
         input[inputOffset];
-        tempSum[outputOffset] += input.Get();
+        auto inputValue = input.Get();
+        if (reduceOperation == ReduceOperation::Max)
+        {
+            if (inputValue > tempOut[outputOffset])
+            {
+                tempOut[outputOffset] = inputValue;
+            }
+        }
+        else if (reduceOperation == ReduceOperation::Min)
+        {
+            if (inputValue < tempOut[outputOffset])
+            {
+                tempOut[outputOffset] = inputValue;
+            }
+        }
+        else
+        {
+            tempOut[outputOffset] += inputValue;
+        }
     }
 
-    // Takes average by num of elements added to get mean.
+    // Takes average by num of elements added to get MEAN
     size_t numElementsInAxis = 1;
     for (unsigned int idx = 0; idx < numResolvedAxis; ++idx)
     {
@@ -132,18 +144,20 @@
                      (std::numeric_limits<float>::max() / armnn::numeric_cast<float>(numElementsInAxis)));
         numElementsInAxis *= current;
     }
-    if (numElementsInAxis > 0) {
-        for (unsigned int idx = 0; idx < numOutputs; ++idx)
+
+    for (unsigned int idx = 0; idx < numOutputs; ++idx)
+    {
+        output[idx];
+        if (reduceOperation == ReduceOperation::Mean)
         {
-            output[idx];
-            if (reduceOperation == ReduceOperation::Sum)
+            if (numElementsInAxis > 0)
             {
-                output.Set(tempSum[idx]);
+                output.Set(tempOut[idx] / armnn::numeric_cast<float>(numElementsInAxis));
             }
-            else if (reduceOperation == ReduceOperation::Mean)
-            {
-                output.Set(tempSum[idx] / armnn::numeric_cast<float>(numElementsInAxis));
-            }
+        }
+        else
+        {
+            output.Set(tempOut[idx]);
         }
     }
 }