IVGCVSW-4633 Add BF16 conversion support to Neon

 * Add NeonConvertBf16ToFp32Workload
 * Add NeonConvertFp32ToBf16Workload
 * Add BFloat16 type support to NeonConstantWorkload and NeonTensorHandle
 * Add ConvertBf16ToFp32Weight when ConvertBf16ToFp32Layer is added
 * Unit tests
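
Background note (not part of the patch): BFloat16 keeps the sign bit, the 8 exponent
bits and the top 7 mantissa bits of an IEEE 754 single-precision float, so
BF16 -> FP32 conversion is lossless while FP32 -> BF16 discards the low 16
mantissa bits. The sketch below only illustrates that bit-level relationship
via plain truncation; the armnnUtils::FloatingPointConverter routines called by
the new workloads may apply rounding instead, and the helper names here are
illustrative, not Arm NN APIs.

    // Illustrative sketch of the BF16 <-> FP32 bit layout (truncation only).
    #include <cstdint>
    #include <cstring>

    // FP32 -> BF16: keep sign, exponent and the top 7 mantissa bits.
    static uint16_t Fp32ToBf16Truncate(float value)
    {
        uint32_t bits = 0;
        std::memcpy(&bits, &value, sizeof(bits)); // type-pun without UB
        return static_cast<uint16_t>(bits >> 16);
    }

    // BF16 -> FP32: reinsert the 16 bits at the top, lower mantissa bits are zero.
    static float Bf16ToFp32(uint16_t value)
    {
        uint32_t bits = static_cast<uint32_t>(value) << 16;
        float result = 0.0f;
        std::memcpy(&result, &bits, sizeof(result));
        return result;
    }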

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Id5b44a203add5e0c98c1ca4e2162115741b56644
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index c2da4da..a443721 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -146,6 +146,30 @@
     return noErrors;
 }
 
+template <typename LayerT>
+LayerT* ConvertBf16ToFp32Weight(Layer* l)
+{
+    LayerT* layer = boost::polymorphic_downcast<LayerT*>(l);
+    if ((layer->GetType() == LayerType::Convolution2d || layer->GetType() == LayerType::FullyConnected)
+         && layer->m_Weight)
+    {
+        const TensorInfo& info = layer->m_Weight->GetTensorInfo();
+
+        if (info.GetDataType() == DataType::BFloat16)
+        {
+            std::vector<float> newValues(info.GetNumElements());
+
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(
+                layer->m_Weight->template GetTensor<armnn::BFloat16>(), info.GetNumElements(), newValues.data());
+
+            TensorInfo newInfo(info.GetShape(), DataType::Float32);
+            ConstTensor newInput(newInfo, newValues);
+            layer->m_Weight.reset(new ScopedCpuTensorHandle(newInput));
+        }
+    }
+    return layer;
+}
+
 OptimizationResult AttemptBackendAssignment(BackendSettings& backendSettings,
                                             Graph& graph,
                                             Layer* layer,
@@ -260,6 +284,14 @@
                 {
                     convertBf16ToFp32Layers =
                         InsertConvertBf16ToFp32LayersBefore(graph, *layer);
+                    if (layer->GetType() == LayerType::Convolution2d)
+                    {
+                        ConvertBf16ToFp32Weight<Convolution2dLayer>(layer);
+                    }
+                    else if (layer->GetType() == LayerType::FullyConnected)
+                    {
+                        ConvertBf16ToFp32Weight<FullyConnectedLayer>(layer);
+                    }
                 }
 
                 // Insert FP32 -> BF16 conversion layer after current layer
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
index 84091e8..f5a9e05 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
@@ -17,6 +17,8 @@
 {
     switch(dataType)
     {
+        case armnn::DataType::BFloat16:
+            return arm_compute::DataType::BFLOAT16;
         case armnn::DataType::Boolean:
             return arm_compute::DataType::U8;
         case armnn::DataType::Float16:
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index c01a178..44e84fb 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -259,6 +259,16 @@
                                       &TrueFunc<>);
 }
 
+bool NeonLayerSupport::IsConvertBf16ToFp32Supported(const TensorInfo& input,
+                                                    const TensorInfo& output,
+                                                    Optional<std::string&> reasonIfUnsupported) const
+{
+    armnn::IgnoreUnused(input);
+    armnn::IgnoreUnused(output);
+    armnn::IgnoreUnused(reasonIfUnsupported);
+    return true;
+}
+
 bool NeonLayerSupport::IsConvertFp16ToFp32Supported(const TensorInfo& input,
                                                     const TensorInfo& output,
                                                     Optional<std::string&> reasonIfUnsupported) const
@@ -269,6 +279,16 @@
     return true;
 }
 
+bool NeonLayerSupport::IsConvertFp32ToBf16Supported(const TensorInfo& input,
+                                                    const TensorInfo& output,
+                                                    Optional<std::string&> reasonIfUnsupported) const
+{
+    armnn::IgnoreUnused(input);
+    armnn::IgnoreUnused(output);
+    armnn::IgnoreUnused(reasonIfUnsupported);
+    return true;
+}
+
 bool NeonLayerSupport::IsConvertFp32ToFp16Supported(const TensorInfo& input,
                                                     const TensorInfo& output,
                                                     Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index f45db35..fda0bc3 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -60,10 +60,18 @@
     bool IsConstantSupported(const TensorInfo& output,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsConvertBf16ToFp32Supported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsConvertFp16ToFp32Supported(const TensorInfo& input,
                                       const TensorInfo& output,
                                       Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsConvertFp32ToBf16Supported(const TensorInfo& input,
+                                      const TensorInfo& output,
+                                      Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsConvertFp32ToFp16Supported(const TensorInfo& input,
                                       const TensorInfo& output,
                                       Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index 2e9be11..11d2087 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -4,6 +4,7 @@
 //
 #pragma once
 
+#include <BFloat16.hpp>
 #include <Half.hpp>
 
 #include <aclCommon/ArmComputeTensorHandle.hpp>
@@ -176,6 +177,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
                                                                  static_cast<uint8_t*>(memory));
                 break;
+            case arm_compute::DataType::BFLOAT16:
+                armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
+                                                                 static_cast<armnn::BFloat16*>(memory));
+                break;
             case arm_compute::DataType::F16:
                 armcomputetensorutils::CopyArmComputeITensorData(this->GetTensor(),
                                                                  static_cast<armnn::Half*>(memory));
@@ -210,6 +215,10 @@
                 armcomputetensorutils::CopyArmComputeITensorData(static_cast<const uint8_t*>(memory),
                                                                  this->GetTensor());
                 break;
+            case arm_compute::DataType::BFLOAT16:
+                armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::BFloat16*>(memory),
+                                                                 this->GetTensor());
+                break;
             case arm_compute::DataType::F16:
                 armcomputetensorutils::CopyArmComputeITensorData(static_cast<const armnn::Half*>(memory),
                                                                  this->GetTensor());
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index 982104e..47f7205 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -154,6 +154,13 @@
     return std::make_unique<NeonConstantWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertBf16ToFp32(
+    const ConvertBf16ToFp32QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonConvertBf16ToFp32Workload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32(
     const ConvertFp16ToFp32QueueDescriptor& descriptor,
     const WorkloadInfo& info) const
@@ -161,6 +168,13 @@
     return std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToBf16(
+    const ConvertFp32ToBf16QueueDescriptor& descriptor,
+    const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonConvertFp32ToBf16Workload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16(
     const ConvertFp32ToFp16QueueDescriptor& descriptor,
     const WorkloadInfo& info) const
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index f122792..d6968fa 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -66,9 +66,15 @@
     std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
                                               const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateConvertBf16ToFp32(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
                                                        const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateConvertFp32ToBf16(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
                                                        const WorkloadInfo& info) const override;
 
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 3b3333a..c6b2230 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -30,6 +30,8 @@
         workloads/NeonComparisonWorkload.cpp \
         workloads/NeonConcatWorkload.cpp \
         workloads/NeonConstantWorkload.cpp \
+        workloads/NeonConvertBf16ToFp32Workload.cpp \
+        workloads/NeonConvertFp32ToBf16Workload.cpp \
         workloads/NeonConvertFp16ToFp32Workload.cpp \
         workloads/NeonConvertFp32ToFp16Workload.cpp \
         workloads/NeonConvolution2dWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 19a8916..20644fe 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -509,6 +509,12 @@
 ARMNN_AUTO_TEST_CASE(ConcatUint8DifferentInputOutputQParam,
                      ConcatDifferentInputOutputQParamTest<DataType::QAsymmU8>, false)
 
+// Convert from BFloat16 to Float32
+ARMNN_AUTO_TEST_CASE(ConvertBf16ToFp32, ConvertBf16ToFp32Test)
+
+// Convert from Float32 to BFloat16
+ARMNN_AUTO_TEST_CASE(ConvertFp32ToBf16, ConvertFp32ToBf16Test)
+
 // Fully Connected
 ARMNN_AUTO_TEST_CASE(SimpleFullyConnected, FullyConnectedFloat32Test, false, false)
 ARMNN_AUTO_TEST_CASE(SimpleFullyConnectedWithBias, FullyConnectedFloat32Test, true, false)
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index cbe1e3b..7db315f 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -22,8 +22,12 @@
     NeonConcatWorkload.hpp
     NeonConstantWorkload.cpp
     NeonConstantWorkload.hpp
+    NeonConvertBf16ToFp32Workload.cpp
+    NeonConvertBf16ToFp32Workload.hpp
     NeonConvertFp16ToFp32Workload.cpp
     NeonConvertFp16ToFp32Workload.hpp
+    NeonConvertFp32ToBf16Workload.cpp
+    NeonConvertFp32ToBf16Workload.hpp
     NeonConvertFp32ToFp16Workload.cpp
     NeonConvertFp32ToFp16Workload.hpp
     NeonConvolution2dWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonConstantWorkload.cpp b/src/backends/neon/workloads/NeonConstantWorkload.cpp
index 08f0390..83a2692 100644
--- a/src/backends/neon/workloads/NeonConstantWorkload.cpp
+++ b/src/backends/neon/workloads/NeonConstantWorkload.cpp
@@ -6,6 +6,7 @@
 #include "NeonConstantWorkload.hpp"
 
 #include <arm_compute/core/Types.h>
+#include <BFloat16.hpp>
 #include <Half.hpp>
 #include <aclCommon/ArmComputeTensorUtils.hpp>
 #include <neon/NeonTensorHandle.hpp>
@@ -46,6 +47,11 @@
 
         switch (computeDataType)
         {
+            case arm_compute::DataType::BFLOAT16:
+            {
+                CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<BFloat16>(), output);
+                break;
+            }
             case arm_compute::DataType::F16:
             {
                 CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output);
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
new file mode 100644
index 0000000..79d1f22
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.cpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertBf16ToFp32Workload.hpp"
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <BFloat16.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertBf16ToFp32Workload::NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor,
+                                                             const WorkloadInfo& info)
+     : BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>(descriptor, info)
+{
+    this->m_Data.ValidateInputsOutputs("NeonConvertBf16ToFp32Workload", 1, 1);
+    GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertBf16ToFp32Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertBf16ToFp32Workload_Execute");
+
+    auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+        {
+            auto input = reinterpret_cast<const BFloat16*>(src);
+            auto output = reinterpret_cast<float*>(dst);
+            size_t numElements = size/2; // 2 bytes per Bf16
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(input, numElements, output);
+        };
+
+    for (const auto& pair : m_TensorHandlePairs)
+    {
+        CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+    }
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
new file mode 100644
index 0000000..0969088
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertBf16ToFp32Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertBf16ToFp32Workload : public BFloat16ToFloat32Workload<ConvertBf16ToFp32QueueDescriptor>
+{
+public:
+    NeonConvertBf16ToFp32Workload(const ConvertBf16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+    std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
new file mode 100644
index 0000000..e1aceec
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.cpp
@@ -0,0 +1,44 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonConvertFp32ToBf16Workload.hpp"
+
+#include <BFloat16.hpp>
+#include <Profiling.hpp>
+
+#include <armnnUtils/FloatingPointConverter.hpp>
+
+#include <backendsCommon/WorkloadUtils.hpp>
+
+namespace armnn
+{
+
+NeonConvertFp32ToBf16Workload::NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor,
+                                                             const WorkloadInfo& info)
+    : Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>(descriptor, info)
+{
+    this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToBf16Workload", 1, 1);
+    GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
+}
+
+void NeonConvertFp32ToBf16Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToBf16Workload_Execute");
+
+    auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size)
+        {
+            auto input = reinterpret_cast<const float*>(src);
+            auto output = reinterpret_cast<BFloat16*>(dst);
+            size_t numElements = size/2; // 2 bytes per bf16
+            armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(input, numElements, output);
+        };
+
+    for (const auto& pair : m_TensorHandlePairs)
+    {
+        CopyTensorContentsGeneric(pair.first, pair.second, convertFunc);
+    }
+}
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
new file mode 100644
index 0000000..bc96c16
--- /dev/null
+++ b/src/backends/neon/workloads/NeonConvertFp32ToBf16Workload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+namespace armnn
+{
+
+class NeonConvertFp32ToBf16Workload : public Float32ToBFloat16Workload<ConvertFp32ToBf16QueueDescriptor>
+{
+public:
+    NeonConvertFp32ToBf16Workload(const ConvertFp32ToBf16QueueDescriptor& descriptor, const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
+    std::vector<TensorHandlePair> m_TensorHandlePairs;
+};
+
+} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 2b7eabe..f255547 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -13,7 +13,9 @@
 #include "NeonBatchToSpaceNdWorkload.hpp"
 #include "NeonComparisonWorkload.hpp"
 #include "NeonConstantWorkload.hpp"
+#include "NeonConvertBf16ToFp32Workload.hpp"
 #include "NeonConvertFp16ToFp32Workload.hpp"
+#include "NeonConvertFp32ToBf16Workload.hpp"
 #include "NeonConvertFp32ToFp16Workload.hpp"
 #include "NeonConvolution2dWorkload.hpp"
 #include "NeonDepthToSpaceWorkload.hpp"