IVGCVSW-5410 Add front-end support for CAST
IVGCVSW-5415 Add TfLiteParser support for CAST

 * Added front end support for CAST, including support in the
Reference workload, Serialization, Deserialization, Unit tests, and
TfLiteParser.

Signed-off-by: mathad01 <matthew.haddon@arm.com>
Change-Id: Iaf670ca5912a21ed6bc84f7f83a68b42154846bb
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index 77067d9..2e171f9 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -88,6 +88,13 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsCastSupported(const TensorInfo&, //input
+                                       const TensorInfo&, //output
+                                       Optional<std::string &> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
 bool LayerSupportBase::IsComparisonSupported(const TensorInfo&, // input0
                                              const TensorInfo&, // input1
                                              const TensorInfo&, // output
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index e04d657..a6f1b34 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -47,6 +47,10 @@
                                    const BatchToSpaceNdDescriptor& descriptor,
                                    Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsCastSupported(const TensorInfo& input,
+                         const TensorInfo& output,
+                         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsComparisonSupported(const TensorInfo& input0,
                                const TensorInfo& input1,
                                const TensorInfo& output,
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 2c5303c..100d23e 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -706,6 +706,33 @@
     }
 }
 
+void CastQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"CastQueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    std::vector<DataType> supportedTypes =
+    {
+            DataType::BFloat16,
+            DataType::Float16,
+            DataType::Float32,
+            DataType::QAsymmS8,
+            DataType::QAsymmU8,
+            DataType::QSymmS8,
+            DataType::QSymmS16,
+            DataType::Signed32,
+            DataType::Signed64
+    };
+
+    ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
+    ValidateTensorShapesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
 void SoftmaxQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"SoftmaxQueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 8a2dd1f..abaa4f5 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -155,6 +155,11 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct CastQueueDescriptor : QueueDescriptor
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 // Fill layer workload data.
 struct FillQueueDescriptor : QueueDescriptorWithParameters<FillDescriptor>
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 20d7134..9d7d5bd 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -138,6 +138,16 @@
                                                                   reason);
             break;
         }
+        case LayerType::Cast:
+        {
+            const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+
+            result = layerSupportObject.IsCastSupported(OverrideDataType(input, dataType),
+                                                        OverrideDataType(output, dataType),
+                                                        reason);
+            break;
+        }
         case LayerType::Comparison:
         {
             auto cLayer = PolymorphicDowncast<const ComparisonLayer*>(&layer);
@@ -1345,6 +1355,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreateCast(const CastQueueDescriptor& /*descriptor*/,
+                                                       const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateComparison(const ComparisonQueueDescriptor& /*descriptor*/,
                                                               const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index 13fd190..42360d3 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -87,6 +87,9 @@
     virtual std::unique_ptr<IWorkload> CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
                                                             const WorkloadInfo& Info) const;
 
+    virtual std::unique_ptr<IWorkload> CreateCast(const CastQueueDescriptor& descriptor,
+                                                  const WorkloadInfo& Info) const;
+
     virtual std::unique_ptr<IWorkload> CreateComparison(const ComparisonQueueDescriptor& descriptor,
                                                         const WorkloadInfo& Info) const;
 
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 54c7916..6e4a8c7 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -43,6 +43,7 @@
     test/layerTests/AdditionTestImpl.cpp \
     test/layerTests/ArgMinMaxTestImpl.cpp \
     test/layerTests/BatchNormalizationTestImpl.cpp \
+    test/layerTests/CastTestImpl.cpp \
     test/layerTests/ComparisonTestImpl.cpp \
     test/layerTests/ConcatTestImpl.cpp \
     test/layerTests/ConstantTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 248ada9..98b800b 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -68,6 +68,8 @@
     layerTests/BatchNormalizationTestImpl.cpp
     layerTests/BatchNormalizationTestImpl.hpp
     layerTests/BatchToSpaceNdTestImpl.hpp
+    layerTests/CastTestImpl.cpp
+    layerTests/CastTestImpl.hpp
     layerTests/ComparisonTestImpl.cpp
     layerTests/ComparisonTestImpl.hpp
     layerTests/ConcatTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index c7d1dd2..b73efbe 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -566,6 +566,8 @@
 
 DECLARE_LAYER_POLICY_2_PARAM(BatchToSpaceNd)
 
+DECLARE_LAYER_POLICY_1_PARAM(Cast)
+
 DECLARE_LAYER_POLICY_2_PARAM(Comparison)
 
 DECLARE_LAYER_POLICY_2_PARAM(Concat)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index a7dcb99..c1b4b46 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -11,6 +11,7 @@
 #include <backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp>
 #include <backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp>
 #include <backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp>
+#include <backendsCommon/test/layerTests/CastTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ComparisonTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConcatTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/CastTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/CastTestImpl.cpp
new file mode 100644
index 0000000..ad23b8c
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/CastTestImpl.cpp
@@ -0,0 +1,229 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "CastTestImpl.hpp"
+#include "ElementwiseUnaryTestImpl.hpp"
+
+
+template<armnn::DataType inputDataType, armnn::DataType outputDataType, typename TInput, typename TOutput>
+LayerTestResult<TOutput, 4> CastTest(armnn::IWorkloadFactory& workloadFactory,
+                                     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                     const armnn::ITensorHandleFactory& tensorHandleFactory,
+                                     const std::vector<TInput>& inputValues,
+                                     const std::vector<TOutput>& outputValues)
+{
+    IgnoreUnused(memoryManager);
+    armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, inputDataType);
+    armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, outputDataType);
+    float quantizationScale = 1.0f;
+    int32_t quantizationOffset = 0;
+
+    if(armnn::IsQuantizedType<TInput>())
+    {
+        inputTensorInfo.SetQuantizationScale(quantizationScale);
+        inputTensorInfo.SetQuantizationOffset(quantizationOffset);
+    }
+    if(armnn::IsQuantizedType<TOutput>())
+    {
+        outputTensorInfo.SetQuantizationScale(quantizationScale);
+        outputTensorInfo.SetQuantizationOffset(quantizationOffset);
+    }
+
+    auto input = MakeTensor<TInput, 4>(inputTensorInfo, inputValues);
+
+    LayerTestResult<TOutput, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<TOutput, 4>(outputTensorInfo, outputValues);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::CastQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateCast(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<float, 4> CastInt32ToFloat2dTest(armnn::IWorkloadFactory& workloadFactory,
+                                 const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                 const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int32_t> inputValues = { -1, -3, -1, -3, -1, -3, -1, -3, 1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<float> outputValues  = { -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                        3.0f, 1.0f, 3.0f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::Signed32, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<float, 4> CastInt16ToFloat2dTest(armnn::IWorkloadFactory& workloadFactory,
+                                               const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                               const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int16_t> inputValues = { -1, -3, -1, -3, -1, -3, -1, -3, 1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<float> outputValues  = { -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                         3.0f, 1.0f, 3.0f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::QSymmS16, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<float, 4> CastInt8ToFloat2dTest(armnn::IWorkloadFactory& workloadFactory,
+                                                 const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                 const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int8_t> inputValues = { -1, -3, -1, -3, -1, -3, -1, -3, 1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<float> outputValues  = { -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                         3.0f, 1.0f, 3.0f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::QSymmS8, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                        outputValues);
+}
+
+LayerTestResult<float, 4> CastInt8AsymmToFloat2dTest(armnn::IWorkloadFactory& workloadFactory,
+                                                const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int8_t> inputValues = { -1, -3, -1, -3, -1, -3, -1, -3, 1,
+                                        3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<float> outputValues  = { -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                         3.0f, 1.0f, 3.0f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::QAsymmS8, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                        tensorHandleFactory, inputValues, outputValues);
+}
+
+LayerTestResult<float, 4> CastUInt8ToFloat2dTest(armnn::IWorkloadFactory& workloadFactory,
+                                               const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                               const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<uint8_t> inputValues = { 1, 3, 1, 3, 1, 3, 1, 3, 1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<float> outputValues  = { 1.0f, 3.0f, 1.0f, 3.0f, 1.0f, 3.0f, 1.0f, 3.0f, 1.0f,
+                                         3.0f, 1.0f, 3.0f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::QAsymmU8, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<uint8_t, 4> CastInt8ToUInt82dTest(armnn::IWorkloadFactory& workloadFactory,
+                                                  const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                  const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int8_t> inputValues  = { -1, -3, -1, -3, -1, -3, -1, -3, -1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<uint8_t> outputValues = { 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    return CastTest<armnn::DataType::QSymmS8, armnn::DataType::QAsymmU8>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<uint8_t, 4> CastInt8AsymmToUInt82dTest(armnn::IWorkloadFactory& workloadFactory,
+                                               const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                               const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<int8_t> inputValues  = { -1, -3, -1, -3, -1, -3, -1, -3, -1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    std::vector<uint8_t> outputValues = { 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    return CastTest<armnn::DataType::QAsymmS8, armnn::DataType::QAsymmU8>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                          outputValues);
+}
+
+LayerTestResult<float, 4> CastFloat16ToFloat322dTest(armnn::IWorkloadFactory& workloadFactory,
+                                               const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                               const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    using namespace half_float::literal;
+
+    std::vector<armnn::Half> inputValues = { -1.10_h, -3._h, -1.30_h, -3._h, -1._h, -3._h, -1._h, -3._h, 1._h,
+                                         3.10_h, 1._h, 3.30_h, 1._h, 2._h, 1._h, 3._h, 1._h, 3._h };
+    std::vector<float> outputValues  = { -1.1f, -3.0f, -1.3f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                       3.1f, 1.0f, 3.3f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    return CastTest<armnn::DataType::Float16, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                        tensorHandleFactory, inputValues,
+                                                                        outputValues);
+}
+
+LayerTestResult<float, 4> CastBFloat16ToFloat322dTest(armnn::IWorkloadFactory& workloadFactory,
+                                              const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                              const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+
+    std::vector<armnn::BFloat16> inputValues = armnnUtils::QuantizedVector<armnn::BFloat16>(
+            {
+                    -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+                    1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
+            },
+            1.0f, 0);
+
+
+    std::vector<float> outputValues = { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+                                                1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f };
+
+    return CastTest<armnn::DataType::BFloat16, armnn::DataType::Float32>(workloadFactory, memoryManager,
+                                                                        tensorHandleFactory, inputValues, outputValues);
+}
+
+LayerTestResult<armnn::Half, 4> CastFloat32ToFloat162dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    using namespace half_float::literal;
+
+    std::vector<float> inputValues = { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f,
+                                       0.00000004f, 3.4E38f, 300.0f, 0.5f, 1.3f, 1.5f, 2.1E4f, 8.76f, 15.2f, 37.5f };
+    std::vector<armnn::Half> outputValues = {-37.50_h, -15.20_h, -8.76_h, -2._h, -1.50_h, -1.30_h, -0.50_h, -0.40_h,
+                                     0._h, 6.55E4_h, 300._h, 0.50_h, 1.30_h, 1.50_h, 2.1E4_h, 8.76_h, 15.20_h, 37.50_h};
+
+    return CastTest<armnn::DataType::Float32, armnn::DataType::Float16>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<int8_t , 4> CastFloat32ToInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<float> inputValues  = { -1.0f, -3.5f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                         3.1f, 1.5f, 3.9f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    std::vector<int8_t> outputValues = { -1, -3, -1, -3, -1, -3, -1, -3, 1,
+                                        3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    return CastTest<armnn::DataType::Float32, armnn::DataType::QAsymmS8>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
+
+LayerTestResult<uint8_t , 4> CastFloat32ToUInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    std::vector<float> inputValues  = { -1.0f, -3.5f, -1.0f, -3.0f, -1.0f, -3.0f, -1.0f, -3.0f, 1.0f,
+                                        3.1f, 1.5f, 3.9f, 1.0f, 2.0f, 1.0f, 3.0f, 1.0f, 3.0f };
+    std::vector<uint8_t> outputValues = { 0, 0, 0, 0, 0, 0, 0, 0, 1,
+                                         3, 1, 3, 1, 2, 1, 3, 1, 3 };
+    return CastTest<armnn::DataType::Float32, armnn::DataType::QAsymmU8>(workloadFactory, memoryManager,
+                                                                         tensorHandleFactory, inputValues,
+                                                                         outputValues);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/CastTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/CastTestImpl.hpp
new file mode 100644
index 0000000..bf8d5a4
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/CastTestImpl.hpp
@@ -0,0 +1,84 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+#include <Half.hpp>
+
+template<armnn::DataType inputDataType, armnn::DataType outputDataType,
+        typename TInput=armnn::ResolveType<inputDataType>,
+        typename TOutput=armnn::ResolveType<outputDataType>>
+LayerTestResult<TOutput, 4> CastTest(armnn::IWorkloadFactory& workloadFactory,
+                                     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                     const armnn::ITensorHandleFactory& tensorHandleFactory,
+                                     const std::vector<TInput>& inputTensor,
+                                     const std::vector<TOutput>& outputTensor);
+
+
+LayerTestResult<float, 4> CastInt32ToFloat2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastInt16ToFloat2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastInt8ToFloat2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastInt8AsymmToFloat2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastUInt8ToFloat2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 4> CastInt8ToUInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t, 4> CastInt8AsymmToUInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastFloat16ToFloat322dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<float, 4> CastBFloat16ToFloat322dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<armnn::Half, 4> CastFloat32ToFloat162dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<int8_t , 4> CastFloat32ToInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+LayerTestResult<uint8_t , 4> CastFloat32ToUInt82dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 2e0a8f2..e278e45 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -303,6 +303,36 @@
     return supported;
 }
 
+bool RefLayerSupport::IsCastSupported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported) const
+{
+    std::array<DataType, 8> supportedInputTypes =
+            {
+                    DataType::BFloat16,
+                    DataType::Float32,
+                    DataType::Float16,
+                    DataType::QSymmS8,
+                    DataType::QAsymmS8,
+                    DataType::QAsymmU8,
+                    DataType::QSymmS16,
+                    DataType::Signed32
+            };
+
+    bool supported = true;
+    supported &= CheckSupportRule(TypeAnyOf(input, supportedInputTypes), reasonIfUnsupported,
+                                  "Reference cast: input is not a supported type");
+
+
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedInputTypes), reasonIfUnsupported,
+                                  "Reference cast: output is not a supported type");
+
+    supported &= CheckSupportRule(ShapesAreSameTotalSize(input, output), reasonIfUnsupported,
+                                  "Reference cast: input and output shapes have different number of total elements");
+
+    return supported;
+}
+
 bool RefLayerSupport::IsComparisonSupported(const TensorInfo& input0,
                                             const TensorInfo& input1,
                                             const TensorInfo& output,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index b75b778..7a95bb0 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -46,6 +46,10 @@
                                    const BatchToSpaceNdDescriptor& descriptor,
                                    Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsCastSupported(const TensorInfo& input,
+                         const TensorInfo& output,
+                         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsComparisonSupported(const TensorInfo& input0,
                                const TensorInfo& input1,
                                const TensorInfo& output,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index fde6c86..c1e3d58 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -178,6 +178,12 @@
     return std::make_unique<RefBatchToSpaceNdWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateCast(const CastQueueDescriptor& descriptor,
+                                                          const WorkloadInfo& info) const
+{
+    return std::make_unique<RefCastWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateComparison(const ComparisonQueueDescriptor& descriptor,
                                                                 const WorkloadInfo& info) const
 {
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index c22d87f..734c5e4 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -85,6 +85,9 @@
     std::unique_ptr<IWorkload> CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor& descriptor,
                                                     const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateCast(const CastQueueDescriptor& descriptor,
+                                          const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateComparison(const ComparisonQueueDescriptor& descriptor,
                                                 const WorkloadInfo& info) const override;
 
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 9676509..bf18284 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -47,6 +47,7 @@
         workloads/RefArgMinMaxWorkload.cpp \
         workloads/RefBatchNormalizationWorkload.cpp \
         workloads/RefBatchToSpaceNdWorkload.cpp \
+        workloads/RefCastWorkload.cpp \
         workloads/RefComparisonWorkload.cpp \
         workloads/RefConcatWorkload.cpp \
         workloads/RefConstantWorkload.cpp \
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 7371692..228df09 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1471,6 +1471,20 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(QLstm1, QLstmTest1)
 ARMNN_AUTO_TEST_CASE_WITH_THF(QLstm2, QLstmTest2)
 
+// Cast
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt32ToFloat, CastInt32ToFloat2dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt16ToFloat, CastInt16ToFloat2dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt8ToFloat, CastInt8ToFloat2dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt8AsymmToFloat, CastInt8AsymmToFloat2dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastUInt8ToFloat, CastUInt8ToFloat2dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt8ToUInt8, CastInt8ToUInt82dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastInt8AsymmToUInt8, CastInt8AsymmToUInt82dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastFloat16ToFloat32, CastFloat16ToFloat322dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastBFloat16ToFloat32, CastBFloat16ToFloat322dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastFloatToFloat16, CastFloat32ToFloat162dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastFloatToInt8, CastFloat32ToInt82dTest)
+ARMNN_AUTO_TEST_CASE_WITH_THF(CastFloatToUInt8, CastFloat32ToUInt82dTest)
+
 // Convert from BFloat16 to Float32
 ARMNN_AUTO_TEST_CASE_WITH_THF(ConvertBf16ToFp32, ConvertBf16ToFp32Test)
 
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 1f4298b..dadedf9 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -63,6 +63,8 @@
     RefBatchNormalizationWorkload.hpp
     RefBatchToSpaceNdWorkload.cpp
     RefBatchToSpaceNdWorkload.hpp
+    RefCastWorkload.cpp
+    RefCastWorkload.hpp
     RefComparisonWorkload.cpp
     RefComparisonWorkload.hpp
     RefConcatWorkload.cpp
diff --git a/src/backends/reference/workloads/RefCastWorkload.cpp b/src/backends/reference/workloads/RefCastWorkload.cpp
new file mode 100644
index 0000000..7080415
--- /dev/null
+++ b/src/backends/reference/workloads/RefCastWorkload.cpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefCastWorkload.hpp"
+#include "RefWorkloadUtils.hpp"
+#include <armnnUtils/FloatingPointConverter.hpp>
+#include <ResolveType.hpp>
+#include "Encoders.hpp"
+#include "Decoders.hpp"
+
+namespace
+{
+    void Cast(armnn::Decoder<float>& in, armnn::Encoder<float>& out, const uint32_t numElements)
+    {
+        for (uint32_t i = 0; i < numElements; ++i)
+        {
+            out.Set(in.Get());
+            ++in;
+            ++out;
+        }
+    }
+}
+
+namespace armnn
+{
+
+    void RefCastWorkload::Execute() const
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefCastWorkload_Execute");
+        const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]);
+        const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+
+        Cast(*MakeDecoder<float>(inputInfo, m_Data.m_Inputs[0]->Map()),
+             *MakeEncoder<float>(outputInfo, m_Data.m_Outputs[0]->Map()),
+             inputInfo.GetNumElements());
+    }
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefCastWorkload.hpp b/src/backends/reference/workloads/RefCastWorkload.hpp
new file mode 100644
index 0000000..6742ef0
--- /dev/null
+++ b/src/backends/reference/workloads/RefCastWorkload.hpp
@@ -0,0 +1,24 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include "RefWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+
+class RefCastWorkload : public BaseWorkload<CastQueueDescriptor>
+{
+public:
+    using BaseWorkload<CastQueueDescriptor>::BaseWorkload;
+    void Execute() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 989644f..d3995f2 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -18,6 +18,7 @@
 #include "RefArgMinMaxWorkload.hpp"
 #include "RefBatchNormalizationWorkload.hpp"
 #include "RefBatchToSpaceNdWorkload.hpp"
+#include "RefCastWorkload.hpp"
 #include "RefComparisonWorkload.hpp"
 #include "RefConvolution2dWorkload.hpp"
 #include "RefConstantWorkload.hpp"