IVGCVSW-1200 Division layer

	* IVGCVSW-1772 Create QueueDescriptors
	* IVGCVSW-1773 Add a CL implementation of the DivisionWorkload
	* IVGCVSW-1774 Add a Neon implementation of the DivisionWorkload
	* IVGCVSW-1775 Add a Ref implementation of the DivisionWorkload
	* IVGCVSW-1776 Add a Division Layer
	* Add simple division unit tests, including broadcasting cases

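For context, a minimal usage sketch of the new layer is shown below. It assumes
AddDivisionLayer is also declared on the public INetwork interface (the Network
override in this change implies that, but the INetwork hunk is not shown here),
and the helper name MakeDivisionNetwork is illustrative only. The two inputs
follow the same broadcast rules as Addition and Multiplication: each dimension
must either match or be 1.

    #include <armnn/ArmNN.hpp>

    // Builds a small network that divides two tensors element-wise.
    armnn::INetworkPtr MakeDivisionNetwork()
    {
        armnn::INetworkPtr net = armnn::INetwork::Create();

        armnn::IConnectableLayer* input0   = net->AddInputLayer(0);
        armnn::IConnectableLayer* input1   = net->AddInputLayer(1);
        armnn::IConnectableLayer* division = net->AddDivisionLayer("division");
        armnn::IConnectableLayer* output   = net->AddOutputLayer(0);

        input0->GetOutputSlot(0).Connect(division->GetInputSlot(0));
        input1->GetOutputSlot(0).Connect(division->GetInputSlot(1));
        division->GetOutputSlot(0).Connect(output->GetInputSlot(0));

        // All tensors share one shape here; a broadcastable divisor (e.g. 1x1x1x1)
        // is also accepted by DivisionQueueDescriptor::Validate.
        const unsigned int dims[] = { 1, 2, 2, 2 };
        armnn::TensorInfo info(4, dims, armnn::DataType::Float32);
        input0->GetOutputSlot(0).SetTensorInfo(info);
        input1->GetOutputSlot(0).SetTensorInfo(info);
        division->GetOutputSlot(0).SetTensorInfo(info);

        return net;
    }
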
Change-Id: I05751fb7f868789f6c06f91e8d25e52b4f12ab5e
diff --git a/src/armnn/InternalTypes.cpp b/src/armnn/InternalTypes.cpp
index 3426da3..7ccef9e 100644
--- a/src/armnn/InternalTypes.cpp
+++ b/src/armnn/InternalTypes.cpp
@@ -22,6 +22,7 @@
         case LayerType::ConvertFp32ToFp16: return "ConvertFp32ToFp16";
         case LayerType::Convolution2d: return "Convolution2d";
         case LayerType::DepthwiseConvolution2d: return "DepthwiseConvolution2d";
+        case LayerType::Division: return "Division";
         case LayerType::FakeQuantization: return "FakeQuantization";
         case LayerType::Floor: return "Floor";
         case LayerType::FullyConnected: return "FullyConnected";
diff --git a/src/armnn/InternalTypes.hpp b/src/armnn/InternalTypes.hpp
index 0968e17..236b77c 100644
--- a/src/armnn/InternalTypes.hpp
+++ b/src/armnn/InternalTypes.hpp
@@ -22,6 +22,7 @@
     ConvertFp32ToFp16,
     Convolution2d,
     DepthwiseConvolution2d,
+    Division,
     FakeQuantization,
     Floor,
     FullyConnected,
diff --git a/src/armnn/LayerSupport.cpp b/src/armnn/LayerSupport.cpp
index 8dcb0dc..50c78cb 100644
--- a/src/armnn/LayerSupport.cpp
+++ b/src/armnn/LayerSupport.cpp
@@ -141,6 +141,16 @@
     FORWARD_LAYER_SUPPORT_FUNC(compute, IsConvolution2dSupported, input, output, descriptor, weights, biases);
 }
 
+bool IsDivisionSupported(Compute compute,
+                         const TensorInfo& input0,
+                         const TensorInfo& input1,
+                         const TensorInfo& output,
+                         char* reasonIfUnsupported,
+                         size_t reasonIfUnsupportedMaxLength)
+{
+    FORWARD_LAYER_SUPPORT_FUNC(compute, IsDivisionSupported, input0, input1, output);
+}
+
 bool IsDepthwiseConvolutionSupported(Compute compute,
                                      const TensorInfo& input,
                                      const TensorInfo& output,
diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp
index e79149f..bf40f0f 100644
--- a/src/armnn/LayersFwd.hpp
+++ b/src/armnn/LayersFwd.hpp
@@ -14,6 +14,7 @@
 #include "layers/ConvertFp32ToFp16Layer.hpp"
 #include "layers/Convolution2dLayer.hpp"
 #include "layers/DepthwiseConvolution2dLayer.hpp"
+#include "layers/DivisionLayer.hpp"
 #include "layers/FakeQuantizationLayer.hpp"
 #include "layers/FloorLayer.hpp"
 #include "layers/FullyConnectedLayer.hpp"
@@ -67,6 +68,7 @@
 DECLARE_LAYER(ConvertFp32ToFp16)
 DECLARE_LAYER(Convolution2d)
 DECLARE_LAYER(DepthwiseConvolution2d)
+DECLARE_LAYER(Division)
 DECLARE_LAYER(FakeQuantization)
 DECLARE_LAYER(Floor)
 DECLARE_LAYER(FullyConnected)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index f510207..76bf4f1 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -584,6 +584,11 @@
     return layer;
 }
 
+IConnectableLayer* Network::AddDivisionLayer(const char* name)
+{
+    return m_Graph->AddLayer<DivisionLayer>(name);
+}
+
 OptimizedNetwork::OptimizedNetwork(std::unique_ptr<Graph> graph)
     : m_Graph(std::move(graph))
 {
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 72100aa..e4dc0e3 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -113,6 +113,8 @@
                                     const LstmInputParams& params,
                                     const char* name = nullptr) override;
 
+    IConnectableLayer* AddDivisionLayer(const char* name = nullptr) override;
+
 private:
     IConnectableLayer* AddFullyConnectedLayerImpl(const FullyConnectedDescriptor& fullyConnectedDescriptor,
         const ConstTensor& weights,
diff --git a/src/armnn/backends/ClLayerSupport.cpp b/src/armnn/backends/ClLayerSupport.cpp
index b00a218..77e74f5 100644
--- a/src/armnn/backends/ClLayerSupport.cpp
+++ b/src/armnn/backends/ClLayerSupport.cpp
@@ -17,11 +17,11 @@
 #include "ClWorkloads/ClAdditionFloat32Workload.hpp"
 #include "ClWorkloads/ClActivationFloat32Workload.hpp"
 #include "ClWorkloads/ClBatchNormalizationFloat32Workload.hpp"
-
 #include "ClWorkloads/ClConvertFp16ToFp32Workload.hpp"
 #include "ClWorkloads/ClConvertFp32ToFp16Workload.hpp"
 #include "ClWorkloads/ClConvolution2dBaseWorkload.hpp"
 #include "ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp"
+#include "ClWorkloads/ClDivisionFloatWorkload.hpp"
 #include "ClWorkloads/ClL2NormalizationFloat32Workload.hpp"
 #include "ClWorkloads/ClMultiplicationFloat32Workload.hpp"
 #include "ClWorkloads/ClFullyConnectedFloat32Workload.hpp"
@@ -238,6 +238,18 @@
                                    biases);
 }
 
+bool IsDivisionSupportedCl(const TensorInfo& input0,
+                           const TensorInfo& input1,
+                           const TensorInfo& output,
+                           std::string* reasonIfUnsupported)
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClDivisionWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input0,
+                                   input1,
+                                   output);
+}
+
 bool IsFullyConnectedSupportedCl(const TensorInfo& input,
                                  const TensorInfo& output,
                                  const TensorInfo& weights,
diff --git a/src/armnn/backends/ClLayerSupport.hpp b/src/armnn/backends/ClLayerSupport.hpp
index ae5f4b0..71bbe7c 100644
--- a/src/armnn/backends/ClLayerSupport.hpp
+++ b/src/armnn/backends/ClLayerSupport.hpp
@@ -54,6 +54,11 @@
                                        const boost::optional<TensorInfo>& biases,
                                        std::string* reasonIfUnsupported = nullptr);
 
+bool IsDivisionSupportedCl(const TensorInfo& input0,
+                           const TensorInfo& input1,
+                           const TensorInfo& output,
+                           std::string* reasonIfUnsupported = nullptr);
+
 bool IsFullyConnectedSupportedCl(const TensorInfo& input,
                                  const TensorInfo& output,
                                  const TensorInfo& weights,
diff --git a/src/armnn/backends/ClWorkloadFactory.cpp b/src/armnn/backends/ClWorkloadFactory.cpp
index 354440c..77959d1 100644
--- a/src/armnn/backends/ClWorkloadFactory.cpp
+++ b/src/armnn/backends/ClWorkloadFactory.cpp
@@ -163,6 +163,12 @@
     return MakeWorkload<ClMultiplicationFloat32Workload, NullWorkload>(descriptor, info);
 }
 
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateDivision(
+    const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return MakeWorkload<ClDivisionFloatWorkload, NullWorkload>(descriptor, info);
+}
+
 std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateBatchNormalization(
     const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
 {
@@ -423,6 +429,12 @@
     return nullptr;
 }
 
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                             const WorkloadInfo& info) const
+{
+    return nullptr;
+}
+
 void ClWorkloadFactory::Finalize()
 {
 }
diff --git a/src/armnn/backends/ClWorkloadFactory.hpp b/src/armnn/backends/ClWorkloadFactory.hpp
index d0786f3..ab8c926 100644
--- a/src/armnn/backends/ClWorkloadFactory.hpp
+++ b/src/armnn/backends/ClWorkloadFactory.hpp
@@ -108,6 +108,9 @@
     virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const override;
 
+    virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                      const WorkloadInfo& info) const override;
+
     virtual void Finalize() override;
 
     virtual void Release() override;
diff --git a/src/armnn/backends/ClWorkloads.hpp b/src/armnn/backends/ClWorkloads.hpp
index 9f5622a..6eb8ada 100644
--- a/src/armnn/backends/ClWorkloads.hpp
+++ b/src/armnn/backends/ClWorkloads.hpp
@@ -17,6 +17,7 @@
 #include "backends/ClWorkloads/ClConvolution2dUint8Workload.hpp"
 #include "backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp"
 #include "backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp"
+#include "backends/ClWorkloads/ClDivisionFloatWorkload.hpp"
 #include "backends/ClWorkloads/ClFloorFloat32Workload.hpp"
 #include "backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp"
 #include "backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp"
diff --git a/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.cpp b/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.cpp
new file mode 100644
index 0000000..07345c3
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.cpp
@@ -0,0 +1,49 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "ClDivisionFloatWorkload.hpp"
+#include "backends/ClTensorHandle.hpp"
+#include "backends/CpuTensorHandle.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0,
+                                               const TensorInfo& input1,
+                                               const TensorInfo& output)
+{
+    const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+    const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+    const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+    // Unlike the multiplication workloads, CLArithmeticDivision takes no scale or rounding policy,
+    // so only the tensor infos need to be validated.
+    return arm_compute::CLArithmeticDivision::validate(&aclInput1, &aclInput2, &aclOutput);
+}
+
+
+ClDivisionFloatWorkload::ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor,
+                                                     const WorkloadInfo& info)
+    : FloatWorkload<DivisionQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClDivisionFloatWorkload", 2, 1);
+
+    arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    // Configure the CL division function with the input and output tensors.
+    m_ArithmeticDivision.configure(&input0, &input1, &output);
+}
+
+void ClDivisionFloatWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClDivisionFloatWorkload_Execute");
+
+    // Executes the layer.
+    m_ArithmeticDivision.run();
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.hpp b/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.hpp
new file mode 100644
index 0000000..bd06d38
--- /dev/null
+++ b/src/armnn/backends/ClWorkloads/ClDivisionFloatWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0,
+                                               const TensorInfo& input1,
+                                               const TensorInfo& output);
+
+class ClDivisionFloatWorkload : public FloatWorkload<DivisionQueueDescriptor>
+{
+public:
+    ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor,
+                            const WorkloadInfo& info);
+
+    using FloatWorkload<DivisionQueueDescriptor>::FloatWorkload;
+    void Execute() const override;
+
+private:
+    mutable arm_compute::CLArithmeticDivision m_ArithmeticDivision;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/NeonLayerSupport.cpp b/src/armnn/backends/NeonLayerSupport.cpp
index 8f73b05..48b3ccb 100644
--- a/src/armnn/backends/NeonLayerSupport.cpp
+++ b/src/armnn/backends/NeonLayerSupport.cpp
@@ -225,6 +225,15 @@
                                    biases);
 }
 
+bool IsDivisionSupportedNeon(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             std::string* reasonIfUnsupported)
+{
+    ignore_unused(input0);
+    ignore_unused(input1);
+    ignore_unused(output);
+    ignore_unused(reasonIfUnsupported);
+    // Division is not supported on Neon yet.
+    return false;
+}
+
 bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
                                    const TensorInfo& output,
                                    const TensorInfo& weights,
diff --git a/src/armnn/backends/NeonLayerSupport.hpp b/src/armnn/backends/NeonLayerSupport.hpp
index 4503244..654d06b 100644
--- a/src/armnn/backends/NeonLayerSupport.hpp
+++ b/src/armnn/backends/NeonLayerSupport.hpp
@@ -59,6 +59,11 @@
                                          const boost::optional<TensorInfo>& biases,
                                          std::string* reasonIfUnsupported = nullptr);
 
+bool IsDivisionSupportedNeon(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             std::string* reasonIfUnsupported = nullptr);
+
 bool IsFullyConnectedSupportedNeon(const TensorInfo& input,
                                    const TensorInfo& output,
                                    const TensorInfo& weights,
diff --git a/src/armnn/backends/NeonWorkloadFactory.cpp b/src/armnn/backends/NeonWorkloadFactory.cpp
index 6ea72f7..2332b8b 100644
--- a/src/armnn/backends/NeonWorkloadFactory.cpp
+++ b/src/armnn/backends/NeonWorkloadFactory.cpp
@@ -156,6 +156,12 @@
     return MakeWorkload<NeonMultiplicationFloat32Workload, NullWorkload>(descriptor, info);
 }
 
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateDivision(
+    const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
 std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateBatchNormalization(
     const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
 {
@@ -417,6 +423,12 @@
     return nullptr;
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const
+{
+    return nullptr;
+}
+
 void NeonWorkloadFactory::Finalize()
 {}
 
diff --git a/src/armnn/backends/NeonWorkloadFactory.hpp b/src/armnn/backends/NeonWorkloadFactory.hpp
index 83e1f5e..f6ddb6d 100644
--- a/src/armnn/backends/NeonWorkloadFactory.hpp
+++ b/src/armnn/backends/NeonWorkloadFactory.hpp
@@ -108,6 +108,9 @@
     virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const override;
 
+    virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                      const WorkloadInfo& info) const override;
+
     virtual void Finalize() override;
 
     virtual void Release() override;
diff --git a/src/armnn/backends/RefLayerSupport.cpp b/src/armnn/backends/RefLayerSupport.cpp
index dd89dd5..ff5809c 100644
--- a/src/armnn/backends/RefLayerSupport.cpp
+++ b/src/armnn/backends/RefLayerSupport.cpp
@@ -117,6 +117,19 @@
                                      &TrueFunc<>);
 }
 
+bool IsDivisionSupportedRef(const TensorInfo& input0,
+                            const TensorInfo& input1,
+                            const TensorInfo& output,
+                            std::string* reasonIfUnsupported)
+{
+    ignore_unused(input1);
+    ignore_unused(output);
+    return IsSupportedForDataTypeRef(reasonIfUnsupported,
+                                     input0.GetDataType(),
+                                     &TrueFunc<>,
+                                     &TrueFunc<>);
+}
+
 bool IsFullyConnectedSupportedRef(const TensorInfo& input,
                                   const TensorInfo& output,
                                   const TensorInfo& weights,
diff --git a/src/armnn/backends/RefLayerSupport.hpp b/src/armnn/backends/RefLayerSupport.hpp
index fde0968..900cf69 100644
--- a/src/armnn/backends/RefLayerSupport.hpp
+++ b/src/armnn/backends/RefLayerSupport.hpp
@@ -51,6 +51,11 @@
                                         const boost::optional<TensorInfo>& biases,
                                         std::string* reasonIfUnsupported = nullptr);
 
+bool IsDivisionSupportedRef(const TensorInfo& input0,
+                            const TensorInfo& input1,
+                            const TensorInfo& output,
+                            std::string* reasonIfUnsupported = nullptr);
+
 bool IsFullyConnectedSupportedRef(const TensorInfo& input,
                                   const TensorInfo& output,
                                   const TensorInfo& weights,
diff --git a/src/armnn/backends/RefWorkloadFactory.cpp b/src/armnn/backends/RefWorkloadFactory.cpp
index 9294c5a..b4e4cf9 100644
--- a/src/armnn/backends/RefWorkloadFactory.cpp
+++ b/src/armnn/backends/RefWorkloadFactory.cpp
@@ -221,4 +221,10 @@
     return std::make_unique<RefConvertFp32ToFp16Workload>(descriptor, info);
 }
 
+std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateDivision(
+    const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+    return MakeWorkload<RefDivisionFloat32Workload, RefDivisionUint8Workload>(descriptor, info);
+}
+
 } // namespace armnn
diff --git a/src/armnn/backends/RefWorkloadFactory.hpp b/src/armnn/backends/RefWorkloadFactory.hpp
index ee8639f..9b9465c 100644
--- a/src/armnn/backends/RefWorkloadFactory.hpp
+++ b/src/armnn/backends/RefWorkloadFactory.hpp
@@ -124,6 +124,9 @@
     virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const override;
 
+    virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                      const WorkloadInfo& info) const override;
+
 private:
 
     template <typename F32Workload, typename U8Workload, typename QueueDescriptorType>
diff --git a/src/armnn/backends/RefWorkloads.hpp b/src/armnn/backends/RefWorkloads.hpp
index 1defdbb..8ce21d4 100644
--- a/src/armnn/backends/RefWorkloads.hpp
+++ b/src/armnn/backends/RefWorkloads.hpp
@@ -55,3 +55,5 @@
 #include "backends/RefWorkloads/RefLstmFloat32Workload.hpp"
 #include "backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp"
 #include "backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp"
+#include "backends/RefWorkloads/RefDivisionFloat32Workload.hpp"
+#include "backends/RefWorkloads/RefDivisionUint8Workload.hpp"
diff --git a/src/armnn/backends/RefWorkloads/Division.cpp b/src/armnn/backends/RefWorkloads/Division.cpp
new file mode 100644
index 0000000..9837fea
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/Division.cpp
@@ -0,0 +1,52 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "Division.hpp"
+#include "Broadcast.hpp"
+
+#include <functional>
+
+namespace
+{
+
+void ElementwiseDivision(unsigned int numElements,
+                         const float* inData0,
+                         const float* inData1,
+                         float* outData)
+{
+    for (unsigned int i = 0; i < numElements; ++i)
+    {
+        // TODO: Decide how to handle division by zero (currently follows IEEE-754 float semantics, giving +/-inf or NaN).
+        outData[i] = inData0[i] / inData1[i];
+    }
+}
+
+} // namespace
+
+namespace armnn
+{
+
+void Division(const TensorShape& inShape0,
+              const TensorShape& inShape1,
+              const TensorShape& outShape,
+              const float* inData0,
+              const float* inData1,
+              float* outData)
+{
+    if (inShape0 == inShape1)
+    {
+        ElementwiseDivision(inShape0.GetNumElements(), inData0, inData1, outData);
+    }
+    else
+    {
+        BroadcastLoop(inShape0, inShape1, outShape).Unroll(std::divides<float>(),
+                                                           0,
+                                                           inData0,
+                                                           inData1,
+                                                           outData);
+    }
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/Division.hpp b/src/armnn/backends/RefWorkloads/Division.hpp
new file mode 100644
index 0000000..d4c7e8d
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/Division.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include <armnn/Tensor.hpp>
+
+namespace armnn
+{
+
+void Division(const TensorShape& inShape0,
+              const TensorShape& inShape1,
+              const TensorShape& outShape,
+              const float* inData0,
+              const float* inData1,
+              float* outData);
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.cpp
new file mode 100644
index 0000000..7cbd1fa
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.cpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "RefDivisionFloat32Workload.hpp"
+
+#include "Division.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include "Profiling.hpp"
+
+namespace armnn
+{
+
+void RefDivisionFloat32Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDivisionFloat32Workload_Execute");
+
+    const TensorShape& inShape0 = GetTensorInfo(m_Data.m_Inputs[0]).GetShape();
+    const TensorShape& inShape1 = GetTensorInfo(m_Data.m_Inputs[1]).GetShape();
+    const TensorShape& outShape = GetTensorInfo(m_Data.m_Outputs[0]).GetShape();
+
+    float* outputData = GetOutputTensorDataFloat(0, m_Data);
+    const float* inputData0 = GetInputTensorDataFloat(0, m_Data);
+    const float* inputData1 = GetInputTensorDataFloat(1, m_Data);
+
+    Division(inShape0, inShape1, outShape, inputData0, inputData1, outputData);
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.hpp
new file mode 100644
index 0000000..e31c255
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefDivisionFloat32Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+class RefDivisionFloat32Workload : public Float32Workload<DivisionQueueDescriptor>
+{
+public:
+    using Float32Workload<DivisionQueueDescriptor>::Float32Workload;
+    virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.cpp
new file mode 100644
index 0000000..4354e70
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.cpp
@@ -0,0 +1,37 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#include "RefDivisionUint8Workload.hpp"
+
+#include "Division.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include "Profiling.hpp"
+
+#include <vector>
+
+namespace armnn
+{
+
+void RefDivisionUint8Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDivisionUint8Workload_Execute");
+
+    const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(m_Data.m_Inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+
+    auto dequant0 = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo0);
+    auto dequant1 = Dequantize(GetInputTensorDataU8(1, m_Data), inputInfo1);
+
+    std::vector<float> results(outputInfo.GetNumElements());
+    Division(
+        inputInfo0.GetShape(), inputInfo1.GetShape(), outputInfo.GetShape(),
+        dequant0.data(), dequant1.data(), results.data());
+
+    Quantize(GetOutputTensorDataU8(0, m_Data), results.data(), outputInfo);
+}
+
+} //namespace armnn
diff --git a/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.hpp
new file mode 100644
index 0000000..d9e26ce
--- /dev/null
+++ b/src/armnn/backends/RefWorkloads/RefDivisionUint8Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+
+#pragma once
+
+#include "backends/Workload.hpp"
+#include "backends/WorkloadData.hpp"
+
+namespace armnn
+{
+
+class RefDivisionUint8Workload : public Uint8Workload<DivisionQueueDescriptor>
+{
+public:
+    using Uint8Workload<DivisionQueueDescriptor>::Uint8Workload;
+    virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/armnn/backends/WorkloadData.cpp b/src/armnn/backends/WorkloadData.cpp
index aa76380..626b1eb 100644
--- a/src/armnn/backends/WorkloadData.cpp
+++ b/src/armnn/backends/WorkloadData.cpp
@@ -798,4 +798,17 @@
                               "output");
 }
 
+void DivisionQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    ValidateTwoInputs(workloadInfo, "DivisionQueueDescriptor");
+    ValidateSingleOutput(workloadInfo, "DivisionQueueDescriptor");
+
+    ValidateBroadcastTensorShapesMatch(workloadInfo.m_InputTensorInfos[0],
+                                       workloadInfo.m_InputTensorInfos[1],
+                                       workloadInfo.m_OutputTensorInfos[0],
+                                       "DivisionQueueDescriptor",
+                                       "first input",
+                                       "second input");
+}
+
 } //namespace armnn
diff --git a/src/armnn/backends/WorkloadData.hpp b/src/armnn/backends/WorkloadData.hpp
index db266e6..e7110a4 100644
--- a/src/armnn/backends/WorkloadData.hpp
+++ b/src/armnn/backends/WorkloadData.hpp
@@ -184,6 +184,12 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+// Division layer workload data.
+struct DivisionQueueDescriptor : QueueDescriptor
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 // Batch norm layer workload data.
 struct BatchNormalizationQueueDescriptor : QueueDescriptorWithParameters<BatchNormalizationDescriptor>
 {
diff --git a/src/armnn/backends/WorkloadFactory.cpp b/src/armnn/backends/WorkloadFactory.cpp
index 5708dc0..bdfda2b 100644
--- a/src/armnn/backends/WorkloadFactory.cpp
+++ b/src/armnn/backends/WorkloadFactory.cpp
@@ -482,6 +482,19 @@
                                           reasonCapacity);
             break;
         }
+        case LayerType::Division:
+        {
+            const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = IsDivisionSupported(compute,
+                                         OverrideDataType(input0, dataType),
+                                         OverrideDataType(input1, dataType),
+                                         OverrideDataType(output, dataType),
+                                         reason,
+                                         reasonCapacity);
+            break;
+        }
         case LayerType::Reshape:
         {
             const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
diff --git a/src/armnn/backends/WorkloadFactory.hpp b/src/armnn/backends/WorkloadFactory.hpp
index c211a29..960a71f 100644
--- a/src/armnn/backends/WorkloadFactory.hpp
+++ b/src/armnn/backends/WorkloadFactory.hpp
@@ -120,6 +120,9 @@
 
     virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const = 0;
+
+    virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+                                                      const WorkloadInfo& info) const = 0;
 };
 
 } //namespace armnn
diff --git a/src/armnn/backends/test/ArmComputeCl.cpp b/src/armnn/backends/test/ArmComputeCl.cpp
index d0cb724..4f6abad 100644
--- a/src/armnn/backends/test/ArmComputeCl.cpp
+++ b/src/armnn/backends/test/ArmComputeCl.cpp
@@ -140,6 +140,11 @@
 ARMNN_AUTO_TEST_CASE(SimpleAdd, AdditionTest)
 ARMNN_AUTO_TEST_CASE(AddBroadcast1Element, AdditionBroadcast1ElementTest)
 
+// Div
+ARMNN_AUTO_TEST_CASE(SimpleDivision, DivisionTest)
+ARMNN_AUTO_TEST_CASE(DivisionBroadcast1Element, DivisionBroadcast1ElementTest)
+ARMNN_AUTO_TEST_CASE(DivisionBroadcast1DVector, DivisionBroadcast1DVectorTest)
+
 // Mul
 ARMNN_AUTO_TEST_CASE(SimpleMultiplication, MultiplicationTest)
 ARMNN_AUTO_TEST_CASE(MultiplicationBroadcast1Element, MultiplicationBroadcast1ElementTest)
diff --git a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
index eca3068..406dddd 100644
--- a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
+++ b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp
@@ -340,6 +340,8 @@
 
 DECLARE_LAYER_POLICY_2_PARAM(Pooling2d)
 
+DECLARE_LAYER_POLICY_1_PARAM(Division)
+
 DECLARE_LAYER_POLICY_2_PARAM(ResizeBilinear)
 
 DECLARE_LAYER_POLICY_2_PARAM(Reshape)
diff --git a/src/armnn/backends/test/LayerTests.cpp b/src/armnn/backends/test/LayerTests.cpp
index 8039ffb..e916c05 100644
--- a/src/armnn/backends/test/LayerTests.cpp
+++ b/src/armnn/backends/test/LayerTests.cpp
@@ -1070,6 +1070,128 @@
 }
 
 namespace {
+    LayerTestResult<float,4> DivisionTestHelper(armnn::IWorkloadFactory& workloadFactory,
+                                                const unsigned int shape0[4],
+                                                const std::vector<float> & values0,
+                                                const unsigned int shape1[4],
+                                                const std::vector<float> & values1,
+                                                const unsigned int outShape[4],
+                                                const std::vector<float> & outValues)
+    {
+        const size_t dimensionCount = 4;
+        armnn::TensorInfo inputTensorInfo0{dimensionCount, shape0, armnn::DataType::Float32};
+        armnn::TensorInfo inputTensorInfo1{dimensionCount, shape1, armnn::DataType::Float32};
+        armnn::TensorInfo outputTensorInfo{dimensionCount, outShape, armnn::DataType::Float32};
+
+        auto input0 = MakeTensor<float, 4>(inputTensorInfo0, values0);
+        auto input1 = MakeTensor<float, 4>(inputTensorInfo1, values1);
+
+        LayerTestResult<float,4> ret(outputTensorInfo);
+
+        std::unique_ptr<armnn::ITensorHandle> inputHandle0 = workloadFactory.CreateTensorHandle(inputTensorInfo0);
+        std::unique_ptr<armnn::ITensorHandle> inputHandle1 = workloadFactory.CreateTensorHandle(inputTensorInfo1);
+        std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+        armnn::DivisionQueueDescriptor data;
+        armnn::WorkloadInfo info;
+        AddInputToWorkload(data, info, inputTensorInfo0, inputHandle0.get());
+        AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+        AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+        std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDivision(data, info);
+
+        inputHandle0->Allocate();
+        inputHandle1->Allocate();
+        outputHandle->Allocate();
+
+        CopyDataToITensorHandle(inputHandle0.get(), &input0[0][0][0][0]);
+        CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0][0]);
+
+        workloadFactory.Finalize();
+        workload->Execute();
+
+        CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+        ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo, outValues);
+        return ret;
+    }
+} // anonymous namespace
+
+LayerTestResult<float,4> DivisionTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    const unsigned int width = 2;
+    const unsigned int height = 2;
+    const unsigned int channelCount = 2;
+    const unsigned int batchSize = 2;
+
+    unsigned int shape[] = { batchSize, channelCount, height, width };
+
+    std::vector<float> input0({
+                                      2,  2,  2,  2,    3,  3,  3,  3,
+                                      4,  4,  4,  4,    5,  5,  5,  5 });
+
+    std::vector<float> input1({
+                                      1,  1,  1,  1,    2,  2,  2,  2,
+                                      4,  4,  4,  4,    4,  4,  4,  4 });
+
+    std::vector<float> output({
+                                      2,     2,     2,     2,     1.5,  1.5,  1.5,  1.5,
+                                      1,     1,     1,     1,     1.25, 1.25, 1.25, 1.25 });
+
+    return DivisionTestHelper(workloadFactory,
+                              shape,
+                              input0,
+                              shape,
+                              input1,
+                              shape,
+                              output);
+}
+
+LayerTestResult<float, 4> DivisionBroadcast1ElementTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    unsigned int shape0[] = { 1, 2, 2, 2 };
+    std::vector<float> input0({ 2, 4, 6, 8, 10, 12, 14, 16});
+
+    unsigned int shape1[] = { 1, 1, 1, 1 };
+    std::vector<float> input1({ 2 });
+
+    std::vector<float> output({ 1, 2, 3, 4, 5, 6, 7, 8});
+
+    return DivisionTestHelper(workloadFactory,
+                              shape0,
+                              input0,
+                              shape1,
+                              input1,
+                              shape0,
+                              output);
+}
+
+LayerTestResult<float, 4> DivisionBroadcast1DVectorTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    unsigned int shape0[] = { 1, 3, 3, 2 };
+    std::vector<float> input0({
+                                      1,   4,       3,  8,      5, 12,
+                                      7,   16,      9, 20,     11, 24,
+                                      13,  28,     15, 32,     17, 36});
+
+    unsigned int shape1[] = { 1, 1, 1, 2 };
+    std::vector<float> input1({ 1, 2 });
+
+    std::vector<float> output({
+                                      1,   2,      3,  4,      5,  6,
+                                      7,   8,      9, 10,     11, 12,
+                                      13, 14,     15, 16,     17, 18});
+
+    return DivisionTestHelper(workloadFactory,
+                              shape0,
+                              input0,
+                              shape1,
+                              input1,
+                              shape0,
+                              output);
+}
+
+namespace {
 LayerTestResult<float,4> MultiplicationTestHelper(armnn::IWorkloadFactory& workloadFactory,
                                                   const unsigned int shape0[4],
                                                   const std::vector<float> & values0,
diff --git a/src/armnn/backends/test/LayerTests.hpp b/src/armnn/backends/test/LayerTests.hpp
index 48f73e7..a59ff05 100644
--- a/src/armnn/backends/test/LayerTests.hpp
+++ b/src/armnn/backends/test/LayerTests.hpp
@@ -192,6 +192,10 @@
                                                 armnn::ActivationFunction f,
                                                 unsigned int batchSize);
 
+LayerTestResult<float, 4> DivisionTest(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> DivisionBroadcast1ElementTest(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> DivisionBroadcast1DVectorTest(armnn::IWorkloadFactory& workloadFactory);
+
 LayerTestResult<float, 4> MultiplicationTest(armnn::IWorkloadFactory& workloadFactory);
 LayerTestResult<float, 4> MultiplicationBroadcast1ElementTest(armnn::IWorkloadFactory& workloadFactory);
 LayerTestResult<float, 4> MultiplicationBroadcast1DVectorTest(armnn::IWorkloadFactory& workloadFactory);
diff --git a/src/armnn/backends/test/Reference.cpp b/src/armnn/backends/test/Reference.cpp
index dedeb50..b31723c 100644
--- a/src/armnn/backends/test/Reference.cpp
+++ b/src/armnn/backends/test/Reference.cpp
@@ -146,6 +146,11 @@
 ARMNN_AUTO_TEST_CASE(AddBroadcastUint8, AdditionBroadcastUint8Test)
 ARMNN_AUTO_TEST_CASE(AddBroadcast1ElementUint8, AdditionBroadcast1ElementUint8Test)
 
+// Div
+ARMNN_AUTO_TEST_CASE(SimpleDivision, DivisionTest)
+ARMNN_AUTO_TEST_CASE(DivisionBroadcast1Element, DivisionBroadcast1ElementTest)
+ARMNN_AUTO_TEST_CASE(DivisionBroadcast1DVector, DivisionBroadcast1DVectorTest)
+
 // Mul
 ARMNN_AUTO_TEST_CASE(SimpleMultiplication, MultiplicationTest)
 ARMNN_AUTO_TEST_CASE(MultiplicationBroadcast1Element, MultiplicationBroadcast1ElementTest)
diff --git a/src/armnn/layers/DivisionLayer.cpp b/src/armnn/layers/DivisionLayer.cpp
new file mode 100644
index 0000000..bf09e14
--- /dev/null
+++ b/src/armnn/layers/DivisionLayer.cpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+#include "DivisionLayer.hpp"
+
+#include "LayerCloneBase.hpp"
+
+#include <armnn/TypesUtils.hpp>
+#include <backends/WorkloadData.hpp>
+#include <backends/WorkloadFactory.hpp>
+
+namespace armnn
+{
+
+DivisionLayer::DivisionLayer(const char* name)
+        : Layer(2, 1, LayerType::Division, name)
+{
+}
+
+std::unique_ptr<IWorkload> DivisionLayer::CreateWorkload(const Graph& graph,
+                                                         const IWorkloadFactory& factory) const
+{
+    DivisionQueueDescriptor descriptor;
+
+    return factory.CreateDivision(descriptor, PrepInfoAndDesc(descriptor, graph));
+}
+
+DivisionLayer* DivisionLayer::Clone(Graph& graph) const
+{
+    return CloneBase<DivisionLayer>(graph, GetName());
+}
+
+std::vector<TensorShape> DivisionLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
+{
+    BOOST_ASSERT(inputShapes.size() == 2);
+    auto& input0 = inputShapes[0];
+    auto& input1 = inputShapes[1];
+
+    // Get the max of the inputs.
+    BOOST_ASSERT(input0.GetNumDimensions() == input1.GetNumDimensions());
+    unsigned int numDims = input0.GetNumDimensions();
+    std::vector<unsigned int> dims(numDims);
+
+    for (unsigned int i = 0; i < numDims; i++)
+    {
+        unsigned int dim0 = input0[i];
+        unsigned int dim1 = input1[i];
+
+        // Validates inputs are broadcast compatible.
+#if !NDEBUG
+        if (dim0 != dim1)
+        {
+            BOOST_ASSERT_MSG(dim0 == 1 || dim1 == 1, "Dimensions should either match or one should be of size 1.");
+        }
+#endif
+
+        dims[i] = std::max(dim0, dim1);
+    }
+
+    return std::vector<TensorShape>({ TensorShape(numDims, dims.data()) });
+}
+
+void DivisionLayer::ValidateTensorShapesFromInputs()
+{
+    VerifyLayerConnections(2, CHECK_LOCATION());
+
+    auto inferredShapes = InferOutputShapes({
+                                                    GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(),
+                                                    GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape()
+                                            });
+
+    BOOST_ASSERT(inferredShapes.size() == 1);
+
+    ConditionalThrowIfNotEqual<LayerValidationException>(
+            "DivisionLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.",
+            GetOutputSlot(0).GetTensorInfo().GetShape(),
+            inferredShapes[0]);
+}
+
+} // namespace armnn
diff --git a/src/armnn/layers/DivisionLayer.hpp b/src/armnn/layers/DivisionLayer.hpp
new file mode 100644
index 0000000..1bd69c4
--- /dev/null
+++ b/src/armnn/layers/DivisionLayer.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// See LICENSE file in the project root for full license information.
+//
+#pragma once
+
+#include <Layer.hpp>
+
+namespace armnn
+{
+
+class DivisionLayer : public Layer
+{
+public:
+    virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph,
+                                                      const IWorkloadFactory& factory) const override;
+
+    DivisionLayer* Clone(Graph& graph) const override;
+
+    void ValidateTensorShapesFromInputs() override;
+    std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override;
+
+protected:
+    DivisionLayer(const char* name);
+    ~DivisionLayer() = default;
+};
+
+} // namespace armnn
diff --git a/src/armnn/test/UnitTests.hpp b/src/armnn/test/UnitTests.hpp
index 8d5c705..beb765f 100644
--- a/src/armnn/test/UnitTests.hpp
+++ b/src/armnn/test/UnitTests.hpp
@@ -8,6 +8,7 @@
 #include "armnn/Utils.hpp"
 #include "backends/RefWorkloadFactory.hpp"
 #include "backends/test/LayerTests.hpp"
+#include "TensorHelpers.hpp"
 #include <boost/test/unit_test.hpp>
 
 inline void ConfigureLoggingTest()