IVGCVSW-4516 Add ConvertFp32ToBf16Layer and Ref workload support

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I9099a4f840fb747336f77d20a0868b64e801a310
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index 1ac08af..c3c8421 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -125,6 +125,14 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsConvertFp32ToBf16Supported(const TensorInfo& /*input*/,
+                                                    const TensorInfo& /*output*/,
+                                                    Optional<std::string&> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
+
 bool LayerSupportBase::IsConvertFp32ToFp16Supported(const TensorInfo& /*input*/,
                                                     const TensorInfo& /*output*/,
                                                     Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index 59e8b96..0639833 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -69,6 +69,10 @@
                                       const TensorInfo& output,
                                       Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsConvertFp32ToBf16Supported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsConvertFp32ToFp16Supported(
             const TensorInfo& input,
             const TensorInfo& output,
diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp
index 56736ce..984443b 100644
--- a/src/backends/backendsCommon/Workload.hpp
+++ b/src/backends/backendsCommon/Workload.hpp
@@ -182,6 +182,11 @@
                                                      armnn::DataType::Float32>;
 
 template <typename QueueDescriptor>
+using Float32ToBFloat16Workload = MultiTypedWorkload<QueueDescriptor,
+                                                     armnn::DataType::Float32,
+                                                     armnn::DataType::BFloat16>;
+
+template <typename QueueDescriptor>
 using Float16ToFloat32Workload = MultiTypedWorkload<QueueDescriptor,
                                                     armnn::DataType::Float16,
                                                     armnn::DataType::Float32>;
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 81aefa9..bf26056 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -2039,6 +2039,29 @@
     ValidateTensorShapesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
 }
 
+void ConvertFp32ToBf16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"ConvertFp32ToBf16QueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    if (inputTensorInfo.GetDataType() != DataType::Float32)
+    {
+        throw InvalidArgumentException(descriptorName + ": Input tensor type must be Float32.");
+    }
+
+    if (outputTensorInfo.GetDataType() != DataType::BFloat16)
+    {
+        throw InvalidArgumentException(descriptorName + ": Output tensor type must be BFloat16.");
+    }
+
+    ValidateTensorShapesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
 void ConvertFp32ToFp16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"ConvertFp32ToFp16QueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 9c392d3..85bda54 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -412,6 +412,11 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct ConvertFp32ToBf16QueueDescriptor : QueueDescriptor
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 struct ConvertFp16ToFp32QueueDescriptor : QueueDescriptor
 {
     void Validate(const WorkloadInfo& workloadInfo) const;
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index d932eef..5854bec 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -173,6 +173,13 @@
             result = layerSupportObject->IsConvertFp16ToFp32Supported(input, output, reason);
             break;
         }
+        case LayerType::ConvertFp32ToBf16:
+        {
+            const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = layerSupportObject->IsConvertFp32ToBf16Supported(input, output, reason);
+            break;
+        }
         case LayerType::ConvertFp32ToFp16:
         {
             const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
@@ -1163,6 +1170,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreateConvertFp32ToBf16(const ConvertFp32ToBf16QueueDescriptor& /*desc*/,
+                                                                     const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& /*desc*/,
                                                                      const WorkloadInfo& /*info*/) const
 {
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index 8c22452..0fc7ab9 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -85,6 +85,9 @@
     virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const;
 
+    virtual std::unique_ptr<IWorkload> CreateConvertFp32ToBf16(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const;
+
     virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                                const WorkloadInfo& info) const;
 
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 22de3db..ba55180 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -46,6 +46,7 @@
     test/layerTests/Conv2dTestImpl.cpp \
     test/layerTests/ConvertBf16ToFp32TestImpl.cpp \
     test/layerTests/ConvertFp16ToFp32TestImpl.cpp \
+    test/layerTests/ConvertFp32ToBf16TestImpl.cpp \
     test/layerTests/ConvertFp32ToFp16TestImpl.cpp \
     test/layerTests/DebugTestImpl.cpp \
     test/layerTests/DepthToSpaceTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index ea214de..bfaca6c 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -73,6 +73,8 @@
     layerTests/ConvertBf16ToFp32TestImpl.hpp
     layerTests/ConvertFp16ToFp32TestImpl.cpp
     layerTests/ConvertFp16ToFp32TestImpl.hpp
+    layerTests/ConvertFp32ToBf16TestImpl.cpp
+    layerTests/ConvertFp32ToBf16TestImpl.hpp
     layerTests/ConvertFp32ToFp16TestImpl.cpp
     layerTests/ConvertFp32ToFp16TestImpl.hpp
     layerTests/DebugTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index a070ac0..d646847 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -447,6 +447,8 @@
 
 DECLARE_LAYER_POLICY_1_PARAM(ConvertFp16ToFp32)
 
+DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToBf16)
+
 DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToFp16)
 
 DECLARE_LAYER_POLICY_2_PARAM(Convolution2d)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 1c6277a..600a261 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -15,6 +15,7 @@
 #include <backendsCommon/test/layerTests/ConcatTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertBf16ToFp32TestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.hpp>
+#include <backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.hpp>
 #include <backendsCommon/test/layerTests/Conv2dTestImpl.hpp>
 #include <backendsCommon/test/layerTests/ConstantTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.cpp
new file mode 100644
index 0000000..66eb4ee
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.cpp
@@ -0,0 +1,77 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConvertFp32ToBf16TestImpl.hpp"
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+LayerTestResult<armnn::BFloat16, 4> ConvertFp32ToBf16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    IgnoreUnused(memoryManager);
+
+    const armnn::TensorInfo inputTensorInfo({1, 2, 4, 3}, armnn::DataType::Float32);
+    const armnn::TensorInfo outputTensorInfo({1, 2, 4, 3}, armnn::DataType::BFloat16);
+
+    auto input = MakeTensor<float, 4>(inputTensorInfo,
+        { -37.5f, -15.2f, -8.76f,
+          -2.0f, -1.5f, -1.3f,
+          -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f,
+          1.3f, 1.5f, 2.0f,
+          8.76f, 15.2f, 37.5f,
+          3.8f, // 0x40733333 Round down
+          3.1055E+29f, // 0x707ADC3C Round up
+          9.149516E-10f, // 0x307B7FFF Round down
+          -3.8f, // 0xC0733333 Round down
+          -3.1055E+29f, // 0xF07ADC3C Round up
+          -9.149516E-10f // 0xB07B7FFF Round down
+        });
+
+    std::vector<armnn::BFloat16> outputValues = armnnUtils::QuantizedVector<armnn::BFloat16>(
+        {
+            -37.5f, -15.2f, -8.76f,
+            -2.0f, -1.5f, -1.3f,
+            -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f,
+          1.3f, 1.5f, 2.0f,
+          8.76f, 15.2f, 37.5f,
+          3.796875f, // 0x4073
+          3.1072295E29f, // 0x707B
+          9.131327E-10f, // 0x307B
+          -3.796875f, // 0xC073
+          -3.1072295E29f, // 0xF07B
+          -9.131327E-10f // 0xB07B
+        },
+        1.0f, 0);
+
+    LayerTestResult<armnn::BFloat16, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<armnn::BFloat16, 4>(outputTensorInfo, outputValues);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ConvertFp32ToBf16QueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp32ToBf16(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.hpp
new file mode 100644
index 0000000..4c6125f
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToBf16TestImpl.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <BFloat16.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<armnn::BFloat16, 4> ConvertFp32ToBf16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);