IVGCVSW-4515 Add ConvertBf16ToFp32Layer and Ref workload support

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Ida6d7e1d2c9abe0618f8b711bab9d62c011090d6
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index e8ef46e..1ac08af 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -111,6 +111,13 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsConvertBf16ToFp32Supported(const TensorInfo& /*input*/,
+                                                    const TensorInfo& /*output*/,
+                                                    Optional<std::string&> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
 bool LayerSupportBase::IsConvertFp16ToFp32Supported(const TensorInfo& /*input*/,
                                                     const TensorInfo& /*output*/,
                                                     Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index 888bef5..59e8b96 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -61,6 +61,10 @@
     bool IsConstantSupported(const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsConvertBf16ToFp32Supported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsConvertFp16ToFp32Supported(const TensorInfo& input,
                                       const TensorInfo& output,
                                       Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp
index 8ec09f9..d7434c0 100644
--- a/src/backends/backendsCommon/Workload.hpp
+++ b/src/backends/backendsCommon/Workload.hpp
@@ -177,6 +177,11 @@
                                                        armnn::DataType::Boolean>;
 
 template <typename QueueDescriptor>
+using BFloat16ToFloat32Workload = MultiTypedWorkload<QueueDescriptor,
+                                                     armnn::DataType::BFloat16,
+                                                     armnn::DataType::Float32>;
+
+template <typename QueueDescriptor>
 using Float16ToFloat32Workload = MultiTypedWorkload<QueueDescriptor,
                                                     armnn::DataType::Float16,
                                                     armnn::DataType::Float32>;
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index b501b3d..81aefa9 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -2016,6 +2016,29 @@
     }
 }
 
+void ConvertBf16ToFp32QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"ConvertBf16ToFp32QueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    if (inputTensorInfo.GetDataType() != DataType::BFloat16)
+    {
+        throw InvalidArgumentException(descriptorName + ": Input tensor type must be BFloat16.");
+    }
+
+    if (outputTensorInfo.GetDataType() != DataType::Float32)
+    {
+        throw InvalidArgumentException(descriptorName + ": Output tensor type must be Float32.");
+    }
+
+    ValidateTensorShapesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
 void ConvertFp32ToFp16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"ConvertFp32ToFp16QueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 06289fa..9c392d3 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -407,6 +407,11 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct ConvertBf16ToFp32QueueDescriptor : QueueDescriptor
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 struct ConvertFp16ToFp32QueueDescriptor : QueueDescriptor
 {
     void Validate(const WorkloadInfo& workloadInfo) const;
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 2e1ce0a..d932eef 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -159,6 +159,13 @@
             result = layerSupportObject->IsConstantSupported(OverrideDataType(output, dataType), reason);
             break;
         }
+        case LayerType::ConvertBf16ToFp32:
+        {
+            const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = layerSupportObject->IsConvertBf16ToFp32Supported(input, output, reason);
+            break;
+        }
         case LayerType::ConvertFp16ToFp32:
         {
             const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
@@ -1144,6 +1151,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& /*desc*/,
+                                                                     const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& /*desc*/,
                                                                      const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index dae58b6..8c22452 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -79,6 +79,9 @@
     virtual std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
                                                       const WorkloadInfo& info) const;
 
+    virtual std::unique_ptr<IWorkload> CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const;
+
     virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const;
 
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 56a21b3..22de3db 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -44,6 +44,7 @@
     test/layerTests/ConcatTestImpl.cpp \
     test/layerTests/ConstantTestImpl.cpp \
     test/layerTests/Conv2dTestImpl.cpp \
+    test/layerTests/ConvertBf16ToFp32TestImpl.cpp \
     test/layerTests/ConvertFp16ToFp32TestImpl.cpp \
     test/layerTests/ConvertFp32ToFp16TestImpl.cpp \
     test/layerTests/DebugTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 0376e3e..dc8031f 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -68,6 +68,8 @@
     layerTests/ConstantTestImpl.hpp
     layerTests/Conv2dTestImpl.cpp
     layerTests/Conv2dTestImpl.hpp
+    layerTests/ConvertBf16ToFp32TestImpl.cpp
+    layerTests/ConvertBf16ToFp32TestImpl.hpp
     layerTests/ConvertFp16ToFp32TestImpl.cpp
     layerTests/ConvertFp16ToFp32TestImpl.hpp
     layerTests/ConvertFp32ToFp16TestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index 15608cc..a070ac0 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -443,6 +443,8 @@
 
 DECLARE_LAYER_POLICY_1_PARAM(Constant)
 
+DECLARE_LAYER_POLICY_1_PARAM(ConvertBf16ToFp32)
+
 DECLARE_LAYER_POLICY_1_PARAM(ConvertFp16ToFp32)
 
 DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToFp16)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 62a66df..1c6277a 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -13,6 +13,7 @@
 #include <backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ComparisonTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConcatTestImpl.hpp>
+#include <backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.hpp>
 #include <backendsCommon/test/layerTests/Conv2dTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.cpp
new file mode 100644
index 0000000..0dc3048
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.cpp
@@ -0,0 +1,56 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConvertBf16ToFp32TestImpl.hpp"
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+LayerTestResult<float, 4> ConvertBf16ToFp32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    IgnoreUnused(memoryManager);
+
+    const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::BFloat16);
+    const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+
+    std::vector<armnn::BFloat16> inputValues = armnnUtils::QuantizedVector<armnn::BFloat16>(
+        {
+            -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
+        },
+        1.0f, 0);
+
+    auto input = MakeTensor<armnn::BFloat16, 4>(inputTensorInfo, std::vector<armnn::BFloat16>(inputValues));
+
+    LayerTestResult<float, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo,
+        { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f });
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ConvertBf16ToFp32QueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertBf16ToFp32(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp
new file mode 100644
index 0000000..717ec6a
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <BFloat16.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> ConvertBf16ToFp32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 94128fe..a4f4efd 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -376,6 +376,21 @@
                                   "Reference constant: output is not a supported type.");
 }
 
+bool RefLayerSupport::IsConvertBf16ToFp32Supported(const TensorInfo& input,
+                                                   const TensorInfo& output,
+                                                   Optional<std::string&> reasonIfUnsupported) const
+{
+    bool supported = true;
+
+    supported &= CheckSupportRule(TypeIs(input, DataType::BFloat16), reasonIfUnsupported,
+                                  "Reference for ConvertBf16ToFp32 layer: input type not supported");
+
+    supported &= CheckSupportRule(TypeIs(output, DataType::Float32), reasonIfUnsupported,
+                                  "Reference for ConvertBf16ToFp32 layer: output type not supported");
+
+    return supported;
+}
+
 bool RefLayerSupport::IsConvertFp16ToFp32Supported(const TensorInfo& input,
                                                    const TensorInfo& output,
                                                    Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 27f3f81..ff34781 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -60,6 +60,10 @@
     bool IsConstantSupported(const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsConvertBf16ToFp32Supported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsConvertFp16ToFp32Supported(const TensorInfo& input,
                                       const TensorInfo& output,
                                       Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index aebf19b..c9fc626 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -180,6 +180,13 @@
     return std::make_unique<RefConstantWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertBf16ToFp32(
+    const ConvertBf16ToFp32QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    return std::make_unique<RefConvertBf16ToFp32Workload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertFp16ToFp32(
     const ConvertFp16ToFp32QueueDescriptor& descriptor,
     const WorkloadInfo& info) const
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index b64479e..119605b 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -86,6 +86,9 @@
     std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
                                               const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                        const WorkloadInfo& info) const override;
 
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 010d548..aeac2e2 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -49,6 +49,7 @@
         workloads/RefComparisonWorkload.cpp \
         workloads/RefConcatWorkload.cpp \
         workloads/RefConstantWorkload.cpp \
+        workloads/RefConvertBf16ToFp32Workload.cpp \
         workloads/RefConvertFp16ToFp32Workload.cpp \
         workloads/RefConvertFp32ToFp16Workload.cpp \
         workloads/RefConvolution2dWorkload.cpp \
diff --git a/src/backends/reference/test/RefLayerSupportTests.cpp b/src/backends/reference/test/RefLayerSupportTests.cpp
index ab749c1..0b549db 100644
--- a/src/backends/reference/test/RefLayerSupportTests.cpp
+++ b/src/backends/reference/test/RefLayerSupportTests.cpp
@@ -116,6 +116,38 @@
     BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type output");
 }
 
+BOOST_AUTO_TEST_CASE(IsConvertBf16ToFp32SupportedReference)
+{
+    std::string reasonIfUnsupported;
+
+    bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertBf16ToFp32Layer,
+      armnn::DataType::BFloat16, armnn::DataType::Float32>(reasonIfUnsupported);
+
+    BOOST_CHECK(result);
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertBf16ToFp32SupportedFp32InputReference)
+{
+    std::string reasonIfUnsupported;
+
+    bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertBf16ToFp32Layer,
+      armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported);
+
+    BOOST_CHECK(!result);
+    BOOST_CHECK_EQUAL(reasonIfUnsupported, "Reference for ConvertBf16ToFp32 layer: input type not supported\n");
+}
+
+BOOST_AUTO_TEST_CASE(IsConvertBf16ToFp32SupportedBf16OutputReference)
+{
+    std::string reasonIfUnsupported;
+
+    bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertBf16ToFp32Layer,
+      armnn::DataType::BFloat16, armnn::DataType::BFloat16>(reasonIfUnsupported);
+
+    BOOST_CHECK(!result);
+    BOOST_CHECK_EQUAL(reasonIfUnsupported, "Reference for ConvertBf16ToFp32 layer: output type not supported\n");
+}
+
 BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedReference)
 {
     std::string reasonIfUnsupported;
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 73b2a05..08cfa28 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1142,6 +1142,9 @@
 ARMNN_AUTO_TEST_CASE(LstmLayerInt16NoCifgNoPeepholeNoProjectionInt16Constant,
                      LstmLayerInt16NoCifgNoPeepholeNoProjectionInt16ConstantTest)
 
+// Convert from BFloat16 to Float32
+ARMNN_AUTO_TEST_CASE(ConvertBf16ToFp32, ConvertBf16ToFp32Test)
+
 // Convert from Float16 to Float32
 ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
 // Convert from Float32 to Float16
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index b2d8938..86764d8 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -69,6 +69,8 @@
     RefConcatWorkload.hpp
     RefConstantWorkload.cpp
     RefConstantWorkload.hpp
+    RefConvertBf16ToFp32Workload.cpp
+    RefConvertBf16ToFp32Workload.hpp
     RefConvertFp16ToFp32Workload.cpp
     RefConvertFp16ToFp32Workload.hpp
     RefConvertFp32ToFp16Workload.cpp
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp
new file mode 100644
index 0000000..c4b5416
--- /dev/null
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.cpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefConvertBf16ToFp32Workload.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <BFloat16.hpp>
+
+namespace armnn
+{
+
+void RefConvertBf16ToFp32Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertBf16ToFp32Workload_Execute");
+
+    const BFloat16* const input = GetInputTensorDataBFloat16(0, m_Data);
+    float* const output = GetOutputTensorDataFloat(0, m_Data);
+
+    unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements();
+    armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(input, numElements, output);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
new file mode 100644
index 0000000..87cdc3e
--- /dev/null
+++ b/src/backends/reference/workloads/RefConvertBf16ToFp32Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefConvertBf16ToFp32Workload : public BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>
+{
+public:
+    using BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>::BFloat16ToFloat32Workload;
+    virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloadUtils.hpp b/src/backends/reference/workloads/RefWorkloadUtils.hpp
index c3260c8..6971314 100644
--- a/src/backends/reference/workloads/RefWorkloadUtils.hpp
+++ b/src/backends/reference/workloads/RefWorkloadUtils.hpp
@@ -12,6 +12,7 @@
 
 #include <reference/RefTensorHandle.hpp>
 
+#include <BFloat16.hpp>
 #include <Half.hpp>
 #include <boost/polymorphic_cast.hpp>
 
@@ -68,6 +69,12 @@
     return GetOutputTensorData<Half>(idx, data);
 }
 
+template <typename PayloadType>
+const BFloat16* GetInputTensorDataBFloat16(unsigned int idx, const PayloadType& data)
+{
+    return GetInputTensorData<BFloat16>(idx, data);
+}
+
 ////////////////////////////////////////////
 /// u8 helpers
 ////////////////////////////////////////////
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index a0558ff..37d79f0 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -22,6 +22,7 @@
 #include "RefConvolution2dWorkload.hpp"
 #include "RefConstantWorkload.hpp"
 #include "RefConcatWorkload.hpp"
+#include "RefConvertBf16ToFp32Workload.hpp"
 #include "RefConvertFp16ToFp32Workload.hpp"
 #include "RefConvertFp32ToFp16Workload.hpp"
 #include "RefDebugWorkload.hpp"