IVGCVSW-4520 Implement BFloat16 Optimizer

 * Add ReduceFp32ToBf16 to OptimizerOptions
 * Add ConvertFp32NetworkToBf16
 * Add utility functions to insert conversion layers
 * Add constant conversion BF16 <-> FP32
 * Add unit tests
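
A minimal usage sketch of the new option (hedged; the surrounding network/runtime
setup is illustrative, only m_ReduceFp32ToBf16 is added by this change):

    armnn::OptimizerOptions options;
    options.m_ReduceFp32ToBf16 = true; // mutually exclusive with m_ReduceFp32ToFp16
    armnn::IOptimizedNetworkPtr optNet =
        armnn::Optimize(*network, {armnn::Compute::CpuRef}, runtime->GetDeviceSpec(), options);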

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Iaff77e20c721400b052cb37eb9ef6fe16d7abaff
diff --git a/src/armnn/CompatibleTypes.hpp b/src/armnn/CompatibleTypes.hpp
index 4332f74..1a663d3 100644
--- a/src/armnn/CompatibleTypes.hpp
+++ b/src/armnn/CompatibleTypes.hpp
@@ -5,8 +5,10 @@
 
 #pragma once
 
-#include "armnn/Types.hpp"
-#include "Half.hpp"
+#include <armnn/Types.hpp>
+
+#include <BFloat16.hpp>
+#include <Half.hpp>
 
 namespace armnn
 {
@@ -30,6 +32,12 @@
 }
 
 template<>
+inline bool CompatibleTypes<BFloat16>(DataType dataType)
+{
+    return dataType == DataType::BFloat16;
+}
+
+template<>
 inline bool CompatibleTypes<uint8_t>(DataType dataType)
 {
     return dataType == DataType::Boolean || dataType == DataType::QAsymmU8;
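
The new specialization is what allows the typed tensor accessors used later in this
patch (for example handle->GetTensor<BFloat16>() in ConvertConstants.hpp and the unit
tests) to accept buffers whose TensorInfo reports DataType::BFloat16. A trivial
illustration, assuming the internal header is on the include path:

    bool ok       = armnn::CompatibleTypes<armnn::BFloat16>(armnn::DataType::BFloat16); // true
    bool mismatch = armnn::CompatibleTypes<armnn::BFloat16>(armnn::DataType::Float32);  // false
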
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 7a6fa8f..5f77197 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -248,6 +248,86 @@
                 return result;
             }
         }
+        else if (dataTypeIn == DataType::BFloat16 || dataTypeOut == DataType::BFloat16)
+        {
+            if (IWorkloadFactory::IsLayerSupported(*layer, DataType::Float32, reasonIfUnsupported)
+                && layer->GetType() != LayerType::ConvertFp32ToBf16
+                && layer->GetType() != LayerType::ConvertBf16ToFp32)
+            {
+                // Insert BF16 -> FP32 conversion layer before current layer
+                std::vector<ConvertBf16ToFp32Layer*> convertBf16ToFp32Layers;
+                if (dataTypeIn == DataType::BFloat16)
+                {
+                    convertBf16ToFp32Layers =
+                        InsertConvertBf16ToFp32LayersBefore(graph, *layer);
+                }
+
+                // Insert FP32 -> BF16 conversion layer after current layer
+                std::vector<ConvertFp32ToBf16Layer*> convertFp32ToBf16Layers;
+                if (dataTypeOut == DataType::BFloat16)
+                {
+                    convertFp32ToBf16Layers =
+                        InsertConvertFp32ToBf16LayersAfter(graph, *layer);
+                }
+
+                // Assign a supported backend to the newly introduced conversion layers
+                auto AssignFirstSupportedBackend = [&](Layer* layer, BackendId preferredBackend)
+                    {
+                        bool supportedBackendFound = false;
+                        std::string reasonIfUnsupported;
+
+                        // Try preferred backend first
+                        layer->SetBackendId(preferredBackend);
+                        if (IWorkloadFactory::IsLayerSupported(*layer,
+                                                               EmptyOptional(),
+                                                               reasonIfUnsupported))
+                        {
+                            supportedBackendFound = true;
+                        }
+                        else
+                        {
+                            for (const auto& backend : availablePreferredBackends)
+                            {
+                                // Skip preferred backend (we already determined that it is not supported)
+                                if (backend == preferredBackend)
+                                {
+                                    continue;
+                                }
+
+                                layer->SetBackendId(backend);
+                                if (IWorkloadFactory::IsLayerSupported(*layer,
+                                                                       EmptyOptional(),
+                                                                       reasonIfUnsupported))
+                                {
+                                    supportedBackendFound = true;
+                                    break;
+                                }
+                            }
+                        }
+
+                        return supportedBackendFound;
+                    };
+
+                for (ConvertBf16ToFp32Layer* convertLayer : convertBf16ToFp32Layers)
+                {
+                    if (!AssignFirstSupportedBackend(convertLayer, backend))
+                    {
+                        return ReturnError(convertLayer);
+                    }
+                }
+
+                for (ConvertFp32ToBf16Layer* convertLayer : convertFp32ToBf16Layers)
+                {
+                    if (!AssignFirstSupportedBackend(convertLayer, backend))
+                    {
+                        return ReturnError(convertLayer);
+                    }
+                }
+
+                return result;
+            }
+        }
+
         std::stringstream warningMsg;
         warningMsg << "Layer of type " << GetLayerTypeAsCString(layer->GetType())
                    << " is not supported on requested backend " << layer->GetBackendId().Get()
@@ -898,6 +978,11 @@
         throw armnn::InvalidArgumentException("Invoked Optimize with no backends specified");
     }
 
+    if (options.m_ReduceFp32ToFp16 && options.m_ReduceFp32ToBf16)
+    {
+        throw InvalidArgumentException("BFloat16 and Float16 optimization cannot be enabled at the same time.");
+    }
+
     const Network& network = *boost::polymorphic_downcast<const Network*>(&inNetwork);
     std::unique_ptr<Graph> graph = std::make_unique<Graph>(network.GetGraph());
 
@@ -934,6 +1019,13 @@
         Optimizer::Pass(optGraph, MakeOptimizations(ConvertConstantsFloatToHalf()));
     }
 
+    // If the FP32-to-BF16 optimization is enabled, convert the FP32 network to BF16
+    if (options.m_ReduceFp32ToBf16)
+    {
+        Optimizer::Pass(optGraph, MakeOptimizations(Fp32NetworkToBf16Converter()));
+        Optimizer::Pass(optGraph, MakeOptimizations(ConvertConstantsFloatToBFloat()));
+    }
+
     // Initialize backend settings
     BackendSettings backendSettings(backendPreferences, deviceSpec);
     if (backendSettings.GetAvailablePreferredBackends().empty())
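
The two new passes can also be applied directly to a Graph, which is how the unit
tests below exercise them; a minimal sketch (graph construction elided, names as
added in this patch):

    using namespace armnn;
    using namespace armnn::optimizations;

    // First retype the FP32 network to BF16 and insert conversion layers at the
    // network boundaries, then convert FP32 constant weights to BF16.
    Optimizer::Pass(graph, MakeOptimizations(Fp32NetworkToBf16Converter()));
    Optimizer::Pass(graph, MakeOptimizations(ConvertConstantsFloatToBFloat()));
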
diff --git a/src/armnn/NetworkUtils.cpp b/src/armnn/NetworkUtils.cpp
index 1bbeaac..8653a08 100644
--- a/src/armnn/NetworkUtils.cpp
+++ b/src/armnn/NetworkUtils.cpp
@@ -16,7 +16,7 @@
 namespace
 {
 
-void UpdateOutputSlotFp16ToFp32(OutputSlot& outputSlot)
+void UpdateOutputSlotToFp32(OutputSlot& outputSlot)
 {
     const TensorInfo& origTensorInfo = outputSlot.GetTensorInfo();
     TensorInfo newTensorInfo(origTensorInfo);
@@ -24,19 +24,69 @@
     outputSlot.SetTensorInfo(newTensorInfo);
 }
 
+void ChangeOutputBf16ToFp32(Layer& layer)
+{
+    for (auto&& outputSlot = layer.BeginOutputSlots(); outputSlot != layer.EndOutputSlots(); ++outputSlot)
+    {
+        if (outputSlot->GetTensorInfo().GetDataType() == DataType::BFloat16)
+        {
+            UpdateOutputSlotToFp32(*outputSlot);
+        }
+    }
+}
+
 void ChangeOutputFp16ToFp32(Layer& layer)
 {
     for (auto&& outputSlot = layer.BeginOutputSlots(); outputSlot != layer.EndOutputSlots(); ++outputSlot)
     {
         if (outputSlot->GetTensorInfo().GetDataType() == DataType::Float16)
         {
-            UpdateOutputSlotFp16ToFp32(*outputSlot);
+            UpdateOutputSlotToFp32(*outputSlot);
         }
     }
 }
 
 } // anonymous namespace
 
+std::vector<ConvertBf16ToFp32Layer*> InsertConvertBf16ToFp32LayersBefore(Graph& graph,
+                                                                         Layer& layer,
+                                                                         bool expectCorrectInputType)
+{
+    std::vector<ConvertBf16ToFp32Layer*> convertLayers;
+    convertLayers.reserve(layer.GetNumInputSlots());
+
+    // Insert a ConvertBf16ToFp32Layer before each input slot
+    for (auto&& inputSlot = layer.BeginInputSlots(); inputSlot != layer.EndInputSlots(); ++inputSlot)
+    {
+        bool allowInsert = true;
+        if (expectCorrectInputType)
+        {
+            // Only insert ConvertBf16ToFp32Layer before BF16 input slots
+            OutputSlot* connectedOutputSlot = inputSlot->GetConnectedOutputSlot();
+            allowInsert =
+                connectedOutputSlot && connectedOutputSlot->GetTensorInfo().GetDataType() == DataType::BFloat16;
+        }
+
+        if (allowInsert)
+        {
+            const std::string name =
+                std::string("convert_bf16_to_fp32-" + std::to_string(inputSlot->GetSlotIndex()) + "-") +
+                layer.GetName();
+            ConvertBf16ToFp32Layer* convertLayer =
+                graph.InsertNewLayer<ConvertBf16ToFp32Layer>(*inputSlot, name.c_str());
+
+            TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+            convertInfo.SetDataType(DataType::Float32);
+
+            convertLayer->GetOutputSlot().SetTensorInfo(convertInfo);
+
+            convertLayers.emplace_back(convertLayer);
+        }
+    }
+
+    return convertLayers;
+}
+
 std::vector<ConvertFp16ToFp32Layer*> InsertConvertFp16ToFp32LayersBefore(Graph& graph,
                                                                          Layer& layer,
                                                                          bool expectCorrectInputType)
@@ -76,6 +126,39 @@
     return convertLayers;
 }
 
+std::vector<ConvertFp32ToBf16Layer*> InsertConvertFp32ToBf16LayersAfter(Graph& graph, Layer& layer)
+{
+    const unsigned int numOutputSlots = layer.GetNumOutputSlots();
+
+    std::vector<ConvertFp32ToBf16Layer*> convertLayers;
+    convertLayers.reserve(numOutputSlots);
+
+    // Update Bf16 output slots to FP32 on current layer
+    ChangeOutputBf16ToFp32(layer);
+
+    // Insert a ConvertFp32ToBf16Layer after each FP32 output slot
+    for (unsigned int slotIndex = 0u; slotIndex < numOutputSlots; ++slotIndex)
+    {
+        OutputSlot& outputSlot = layer.GetOutputSlot(slotIndex);
+        if (outputSlot.GetTensorInfo().GetDataType() == DataType::Float32)
+        {
+            const std::string name =
+                std::string("convert_fp32_to_bf16-" + std::to_string(slotIndex) + "-") + layer.GetName();
+            ConvertFp32ToBf16Layer* convertLayer =
+                graph.InsertNewLayer<ConvertFp32ToBf16Layer>(outputSlot, name.c_str());
+
+            TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+            convertInfo.SetDataType(DataType::BFloat16);
+
+            convertLayer->GetOutputSlot().SetTensorInfo(convertInfo);
+
+            convertLayers.emplace_back(convertLayer);
+        }
+    }
+
+    return convertLayers;
+}
+
 std::vector<ConvertFp32ToFp16Layer*> InsertConvertFp32ToFp16LayersAfter(Graph& graph, Layer& layer)
 {
     const unsigned int numOutputSlots = layer.GetNumOutputSlots();
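
Taken together, the two helpers let the backend-assignment path in Network.cpp run a
BF16-in/BF16-out layer in FP32; an illustration of the resulting topology (names as
added in this patch):

    // Before:  ... -(BF16)-> Layer -(BF16)-> ...
    // After:   ... -(BF16)-> ConvertBf16ToFp32 -(FP32)-> Layer -(FP32)-> ConvertFp32ToBf16 -(BF16)-> ...
    auto toFp32 = InsertConvertBf16ToFp32LayersBefore(graph, layer); // one per BF16 input slot
    auto toBf16 = InsertConvertFp32ToBf16LayersAfter(graph, layer);  // outputs retyped to FP32, then one converter per FP32 output
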
diff --git a/src/armnn/NetworkUtils.hpp b/src/armnn/NetworkUtils.hpp
index 38fb223..064545a 100644
--- a/src/armnn/NetworkUtils.hpp
+++ b/src/armnn/NetworkUtils.hpp
@@ -11,6 +11,12 @@
 namespace armnn
 {
 
+std::vector<ConvertBf16ToFp32Layer*> InsertConvertBf16ToFp32LayersBefore(Graph& graph,
+                                                                         Layer& layer,
+                                                                         bool expectCorrectInputType = true);
+
+std::vector<ConvertFp32ToBf16Layer*> InsertConvertFp32ToBf16LayersAfter(Graph& graph, Layer& layer);
+
 std::vector<ConvertFp16ToFp32Layer*> InsertConvertFp16ToFp32LayersBefore(Graph& graph,
                                                                          Layer& layer,
                                                                          bool expectCorrectInputType = true);
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index 273c337..9fc2842 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -6,6 +6,7 @@
 
 #include "AddDebug.hpp"
 #include "ConvertConstants.hpp"
+#include "ConvertFp32NetworkToBf16.hpp"
 #include "ConvertFp32NetworkToFp16.hpp"
 #include "FoldPadIntoConvolution2d.hpp"
 #include "MovePermuteUp.hpp"
diff --git a/src/armnn/optimizations/ConvertConstants.hpp b/src/armnn/optimizations/ConvertConstants.hpp
index 5e19c7b..f3ebcdf 100644
--- a/src/armnn/optimizations/ConvertConstants.hpp
+++ b/src/armnn/optimizations/ConvertConstants.hpp
@@ -13,6 +13,7 @@
 
 #include <armnn/utility/IgnoreUnused.hpp>
 
+#include <BFloat16.hpp>
 #include <Half.hpp>
 
 namespace armnn
@@ -20,6 +21,27 @@
 namespace optimizations
 {
 
+struct BFloat16ToFloat32
+{
+    static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle)
+    {
+        const TensorInfo& info = handle->GetTensorInfo();
+
+        if (info.GetDataType() == DataType::BFloat16)
+        {
+            std::vector<float> newValues(info.GetNumElements());
+
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(handle->GetTensor<BFloat16>(),
+                                                                         info.GetNumElements(),
+                                                                         newValues.data());
+
+            TensorInfo newInfo(info.GetShape(), DataType::Float32);
+            ConstTensor newInput(newInfo, newValues);
+            handle.reset(new ScopedCpuTensorHandle(newInput));
+        }
+    }
+};
+
 struct Float16ToFloat32
 {
     static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle)
@@ -41,6 +63,27 @@
     }
 };
 
+struct Float32ToBFloat16
+{
+    static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle)
+    {
+        const TensorInfo& info = handle->GetTensorInfo();
+
+        if (info.GetDataType() == DataType::Float32)
+        {
+            std::vector<BFloat16> newValues(info.GetNumElements());
+
+            armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(handle->GetTensor<float>(),
+                                                                         info.GetNumElements(),
+                                                                         newValues.data());
+
+            TensorInfo newInfo(info.GetShape(), DataType::BFloat16);
+            ConstTensor newInput(newInfo, newValues);
+            handle.reset(new ScopedCpuTensorHandle(newInput));
+        }
+    }
+};
+
 struct Float32ToFloat16
 {
     static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle)
@@ -97,6 +140,17 @@
     }
 };
 
+struct IsBFloat16Layer
+{
+    static bool Test(const Layer& layer)
+    {
+        return layer.GetDataType() == DataType::BFloat16;
+    }
+};
+
+using ConvertConstantsBFloatToFloat = ConvertConstants<BFloat16ToFloat32, IsFloat32Layer>;
+using ConvertConstantsFloatToBFloat = ConvertConstants<Float32ToBFloat16, IsBFloat16Layer>;
+
 using ConvertConstantsHalfToFloat = ConvertConstants<Float16ToFloat32, IsFloat32Layer>;
 using ConvertConstantsFloatToHalf = ConvertConstants<Float32ToFloat16, IsFloat16Layer>;
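
The Float32ToBFloat16 constant conversion relies on armnnUtils::FloatingPointConverter,
which rounds to the nearest representable BF16 value (the unit tests below check both
round-down and round-up cases); a hedged round-trip sketch using one of those values:

    float in = 3.8f;                                                  // 0x40733333
    armnn::BFloat16 out;
    armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(&in, 1, &out);
    // out == 3.796875f (0x4073): the low 16 mantissa bits round down and are dropped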
 
diff --git a/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp b/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp
new file mode 100644
index 0000000..d6350c3
--- /dev/null
+++ b/src/armnn/optimizations/ConvertFp32NetworkToBf16.hpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+#include "NetworkUtils.hpp"
+
+namespace armnn
+{
+namespace optimizations
+{
+
+class ConvertFp32NetworkToBf16Impl
+{
+public:
+    void Run(Graph& graph, Layer& layer) const
+    {
+        if (layer.GetType() == LayerType::Input)
+        {
+            // If the outputs of this layer are DataType::Float32,
+            // add a ConvertFp32ToBf16 layer after each of the outputs
+            if (layer.GetDataType() == DataType::Float32)
+            {
+                InsertConvertFp32ToBf16LayersAfter(graph, layer);
+            }
+        }
+        else if (layer.GetType() == LayerType::Output)
+        {
+            // If the inputs of this layer are DataType::Float32,
+            // add a ConvertBf16ToFp32 layer before each of the inputs
+            if (layer.GetDataType() == DataType::Float32)
+            {
+                // NOTE: We need to call InsertConvertBf16ToFp32LayersBefore with expectCorrectInputType = false
+                // here, otherwise it will expect the inputs to be DataType::BFloat16
+                InsertConvertBf16ToFp32LayersBefore(graph, layer, false);
+            }
+        }
+        else if (layer.GetType() != LayerType::ConvertFp32ToBf16 && layer.GetType() != LayerType::ConvertBf16ToFp32)
+        {
+            // If the inputs/outputs of this layer are DataType::Float32,
+            // change the data type of all inputs and outputs to DataType::BFloat16
+            for (auto&& input = layer.BeginInputSlots(); input != layer.EndInputSlots(); ++input)
+            {
+                // If this input is connected to the OutputSlot of an InputLayer, do not change the connection's DataType;
+                // the InputSlots of the current layer are updated when a conversion layer is inserted after the InputLayer
+                Layer& base = input->GetConnectedOutputSlot()->GetOwningLayer();
+                if (base.GetType() != LayerType::Input)
+                {
+                    TensorInfo convertInfo = input->GetConnection()->GetTensorInfo();
+                    if (convertInfo.GetDataType() == DataType::Float32)
+                    {
+                        convertInfo.SetDataType(DataType::BFloat16);
+                        input->GetConnection()->SetTensorInfo(convertInfo);
+                    }
+                }
+            }
+
+            // change outputs to DataType::BFloat16
+            for (auto&& output = layer.BeginOutputSlots(); output != layer.EndOutputSlots(); ++output)
+            {
+                TensorInfo convertInfo = output->GetTensorInfo();
+                if (convertInfo.GetDataType() == DataType::Float32)
+                {
+                    convertInfo.SetDataType(DataType::BFloat16);
+                    output->SetTensorInfo(convertInfo);
+                }
+            }
+        }
+    }
+
+protected:
+    ConvertFp32NetworkToBf16Impl() = default;
+    ~ConvertFp32NetworkToBf16Impl() = default;
+};
+
+using Fp32NetworkToBf16Converter = OptimizeForType<Layer, ConvertFp32NetworkToBf16Impl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/test/optimizations/ConvertConstantsBFloatTests.cpp b/src/armnn/test/optimizations/ConvertConstantsBFloatTests.cpp
new file mode 100644
index 0000000..5cb89da
--- /dev/null
+++ b/src/armnn/test/optimizations/ConvertConstantsBFloatTests.cpp
@@ -0,0 +1,127 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../TestUtils.hpp"
+
+#include <BFloat16.hpp>
+#include <Optimizer.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+using namespace armnn::optimizations;
+
+BOOST_AUTO_TEST_CASE(ConvertConstantsFloatToBFloatTest)
+{
+    armnn::Graph graph;
+
+    const armnn::TensorInfo info({ 1, 1, 1, 2 }, armnn::DataType::BFloat16);
+
+    // Create const tensor from fp32 data
+    unsigned int dims[] = { 4, 2, 1, 1 };
+    std::vector<float> floatWeights{ 0.0f, -1.0f,
+                                     3.8f, // 0x40733333 Round down
+                                     3.1055E+29f, // 0x707ADC3C Round up
+                                     9.149516E-10f, // 0x307B7FFF Round down
+                                    -3.8f, // 0xC0733333 Round down
+                                    -3.1055E+29f, // 0xF07ADC3C Round up
+                                    -9.149516E-10f // 0xB07B7FFF Round down
+                                   };
+    armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), floatWeights);
+
+    // Create simple test network
+    auto input = graph.AddLayer<armnn::InputLayer>(0, "input");
+    input->GetOutputSlot().SetTensorInfo(info);
+
+    auto fc      = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc");
+    fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights);
+    fc->GetOutputSlot().SetTensorInfo(info);
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(1, "output");
+
+    // Connect up the layers
+    input->GetOutputSlot().Connect(fc->GetInputSlot(0));
+    fc->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    // Check tensor data type before conversion
+    BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32);
+
+    // Run the optimizer
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsFloatToBFloat()));
+
+    // Check tensor data type after conversion
+    BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::BFloat16);
+
+    // Check whether data matches expected Bf16 data
+    BFloat16* data = fc->m_Weight->GetTensor<BFloat16>();
+    BOOST_CHECK(data[0] == BFloat16(0.0f));
+    BOOST_CHECK(data[1] == BFloat16(-1.0f));
+    BOOST_CHECK(data[2] == BFloat16(3.796875f)); // 0x4073
+    BOOST_CHECK(data[3] == BFloat16(3.1072295E29f)); // 0x707B
+    BOOST_CHECK(data[4] == BFloat16(9.131327E-10f)); // 0x307B
+    BOOST_CHECK(data[5] == BFloat16(-3.796875f)); // 0xC073
+    BOOST_CHECK(data[6] == BFloat16(-3.1072295E29f)); // 0xF07B
+    BOOST_CHECK(data[7] == BFloat16(-9.131327E-10f)); // 0xB07B
+}
+
+BOOST_AUTO_TEST_CASE(ConvertConstantsBFloatToFloatTest)
+{
+    armnn::Graph graph;
+
+    const armnn::TensorInfo info({ 1, 1, 1, 2 }, armnn::DataType::Float32);
+
+    // Create the BFloat16 precision input data
+    unsigned int dims[] = { 4, 2, 1, 1 };
+    std::vector<float> convWeightsData{ 0.f, -1.f,
+                                        3.796875f, // 0x4073
+                                        3.1072295E29f, // 0x707B
+                                        9.131327E-10f, // 0x307B
+                                       -3.796875f, // 0xC073
+                                       -3.1072295E29f, // 0xF07B
+                                       -9.131327E-10f // 0xB07B
+                                       };
+    std::vector<uint16_t> bfWeights(8);
+    armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(convWeightsData.data(), convWeightsData.size(),
+                                                                 bfWeights.data());
+    armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::BFloat16), bfWeights);
+
+    // Create the simple test network
+    auto input = graph.AddLayer<armnn::InputLayer>(0, "input");
+    input->GetOutputSlot().SetTensorInfo(info);
+
+    auto fc      = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc");
+    fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights);
+    fc->GetOutputSlot().SetTensorInfo(info);
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(1, "output");
+
+    // Connect up the layers
+    input->GetOutputSlot().Connect(fc->GetInputSlot(0));
+    fc->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    // Check tensor data type before conversion
+    BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::BFloat16);
+
+    // Run the optimizer
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsBFloatToFloat()));
+
+    // Check tensor data type after conversion
+    BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32);
+
+    // Check whether data matches expected FP32 data
+    float* data = fc->m_Weight->GetTensor<float>();
+    BOOST_CHECK(data[0] == 0.0f);
+    BOOST_CHECK(data[1] == -1.0f);
+    BOOST_CHECK(data[2] == 3.796875f);
+    BOOST_CHECK(data[3] == 3.1072295E29f);
+    BOOST_CHECK(data[4] == 9.131327E-10f);
+    BOOST_CHECK(data[5] == -3.796875f);
+    BOOST_CHECK(data[6] == -3.1072295E29f);
+    BOOST_CHECK(data[7] == -9.131327E-10f);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/armnn/test/optimizations/Fp32NetworkToBf16ConverterTests.cpp b/src/armnn/test/optimizations/Fp32NetworkToBf16ConverterTests.cpp
new file mode 100644
index 0000000..90a1548
--- /dev/null
+++ b/src/armnn/test/optimizations/Fp32NetworkToBf16ConverterTests.cpp
@@ -0,0 +1,45 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../TestUtils.hpp"
+
+#include <Optimizer.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+using namespace armnn::optimizations;
+
+BOOST_AUTO_TEST_CASE(Fp32NetworkToBf16OptimizationTest)
+{
+    armnn::Graph graph;
+
+    const armnn::TensorInfo infoFP32({ 2, 2, 1, 3 }, armnn::DataType::Float32);
+
+    // Create the simple test network
+    auto input = graph.AddLayer<armnn::InputLayer>(0, "input");
+    input->GetOutputSlot().SetTensorInfo(infoFP32);
+
+    auto floor = graph.AddLayer<armnn::FloorLayer>("floor");
+    floor->GetOutputSlot().SetTensorInfo(infoFP32);
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(1, "output");
+
+    // Connect up the layers
+    input->GetOutputSlot().Connect(floor->GetInputSlot(0));
+    floor->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::FloorLayer>, &IsLayerOfType<armnn::OutputLayer>));
+
+    // Run the optimizer
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(Fp32NetworkToBf16Converter()));
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::ConvertFp32ToBf16Layer>, &IsLayerOfType<armnn::FloorLayer>,
+                             &IsLayerOfType<armnn::ConvertBf16ToFp32Layer>, &IsLayerOfType<armnn::OutputLayer>));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file