IVGCVSW-6978: RedirectMembersToConstantInputs does not work with Fp32NetworkToBf16Converter

 * Fuse ConvertFp32ToBf16 layers into Constant layers so that Conv2d and
   FullyConnected can have their weights redirected (see the sketch below).
 * If BF16 is unsupported for Conv2d or FullyConnected, revert the fused
   Constant layer to FP32.
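
A minimal sketch of how the new passes are exercised (assumes an
already-parsed INetwork "network" and a created IRuntime "runtime"; the
CpuAcc backend choice is illustrative, not part of this change):

    // Enabling the FP32 -> BF16 reduction now also runs the fusion pass,
    // so constant weights are converted once at optimization time rather
    // than on every inference.
    armnn::OptimizerOptions options;
    options.m_ReduceFp32ToBf16 = true; // Fp32NetworkToBf16Converter, then
                                       // FuseConversionLayersIntoConstLayers
    armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(
        *network, {armnn::Compute::CpuAcc}, runtime->GetDeviceSpec(), options);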

Change-Id: If523c708a822659d64597d9ae39cca1c2f84b76f
Signed-off-by: Francis Murtagh <francis.murtagh@arm.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 41db866..f0eb81c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -547,6 +547,7 @@
         src/armnn/test/optimizations/Fp32NetworkToFp16ConverterTests.cpp
         src/armnn/test/optimizations/FuseActivationTests.cpp
         src/armnn/test/optimizations/FuseBatchNormTests.cpp
+        src/armnn/test/optimizations/FuseConvertF32BF16IntoConstLayerTests.cpp
         src/armnn/test/optimizations/InsertDebugLayerTests.cpp
         src/armnn/test/optimizations/MovePermuteUpTests.cpp
         src/armnn/test/optimizations/MoveTransposeUpTests.cpp
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 8fe4445..5d44306 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -790,13 +790,18 @@
         }
         else if (dataTypeIn == DataType::BFloat16 || dataTypeOut == DataType::BFloat16)
         {
+            const auto layerType = layer->GetType();
             if (IWorkloadFactory::IsLayerSupported(*layer, DataType::Float32, reasonIfUnsupported)
-                && layer->GetType() != LayerType::ConvertFp32ToBf16
-                && layer->GetType() != LayerType::ConvertBf16ToFp32)
+                && layerType != LayerType::ConvertFp32ToBf16
+                && layerType != LayerType::ConvertBf16ToFp32)
             {
-                // Insert BF16 -> FP32 conversion layer before current layer
+                bool revertConstantWeightsConversion = RevertConstantWeightsToFP32(layer);
+
+                // Insert a BF16 -> FP32 conversion layer before the current layer,
+                // unless the constant weights were reverted to FP32 above.
                 std::vector<ConvertBf16ToFp32Layer*> convertBf16ToFp32Layers;
-                if (dataTypeIn == DataType::BFloat16)
+                if (dataTypeIn == DataType::BFloat16 && dataTypeOut != DataType::BFloat16
+                    && !revertConstantWeightsConversion)
                 {
                     convertBf16ToFp32Layers =
                         InsertConvertBf16ToFp32LayersBefore(graph, *layer);
@@ -1759,10 +1764,12 @@
     // If Fp32 to Bf16 optimization is set convert Fp32 network to Bf16
     // Convert input of Convolution2d and FullyConnected from Fp32 to Bf16
     // Only Constant weight of Convolution2d and FullyConnected are converted from Fp32 to Bf16
+    // Constant and ConvertFp32ToBf16 layers will also be fused, so the conversion is no longer needed at inference time
     if (options.m_ReduceFp32ToBf16)
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Optimizer_ReduceFp32ToBf16");
         Optimizer::Pass(optGraph, MakeOptimizations(Fp32NetworkToBf16Converter()));
+        Optimizer::Pass(optGraph, MakeOptimizations(FuseConversionLayersIntoConstLayers()));
     }
 
     // Initialize backend settings
diff --git a/src/armnn/NetworkUtils.cpp b/src/armnn/NetworkUtils.cpp
index 7597798..5ff0e6c 100644
--- a/src/armnn/NetworkUtils.cpp
+++ b/src/armnn/NetworkUtils.cpp
@@ -1,10 +1,12 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "NetworkUtils.hpp"
 
+#include <armnnUtils/FloatingPointConverter.hpp>
+#include <BFloat16.hpp>
 #include "SubgraphViewSelector.hpp"
 
 #include <armnn/Exceptions.hpp>
@@ -272,4 +274,50 @@
     return debugLayers;
 }
 
+bool RevertConstantWeightsToFP32(Layer* layer)
+{
+    if (layer->GetType() == LayerType::Convolution2d || layer->GetType() == LayerType::FullyConnected)
+    {
+        // Revert the weights on the Constant layer to FP32 so they can be accessed by Conv2d or
+        // FullyConnected. This prevents a conversion layer being added during backend assignment,
+        // which would block the RedirectMembersToConstantInputs backward-compatibility workaround/optimization.
+        auto constantLayerInfo = layer->GetInputSlot(1).GetConnection()->GetTensorInfo();
+
+        if (constantLayerInfo.IsConstant() && constantLayerInfo.GetDataType() == DataType::BFloat16)
+        {
+            std::vector<float> newValues(constantLayerInfo.GetNumElements());
+
+            auto weightLayer = PolymorphicDowncast<ConstantLayer*>(
+                    &layer->GetInputSlot(1).GetConnection()->GetOwningIConnectableLayer());
+            armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(
+                    weightLayer->m_LayerOutput->GetConstTensor<BFloat16>(),
+                    constantLayerInfo.GetNumElements(),
+                    newValues.data());
+
+            TensorInfo newInfo(constantLayerInfo.GetShape(), DataType::Float32);
+            newInfo.SetConstant(true);
+            ConstTensor newInput(newInfo, newValues);
+            weightLayer->m_LayerOutput.reset(new ScopedTensorHandle(newInput));
+            weightLayer->GetOutputSlot(0).SetTensorInfo(newInfo);
+
+            // Connect Conv2d/FullyConnected directly to the layer feeding the conversion layer,
+            // leaving the conversion layer to be cleaned up later.
+            auto& conversionLayer = layer->GetInputSlot(0).GetConnection()->GetOwningIConnectableLayer();
+            auto actualInputOutputSlot = conversionLayer.GetInputSlot(0).GetConnection();
+
+            // Reuse conversionLayer rather than re-deriving it from the input slot each time.
+            auto& conversionLayerOutputSlot = conversionLayer.GetOutputSlot(0);
+            auto& conversionLayerInputSlot  = conversionLayer.GetInputSlot(0);
+
+            actualInputOutputSlot->Disconnect(conversionLayerInputSlot);
+            conversionLayerOutputSlot.Disconnect(layer->GetInputSlot(0));
+
+            actualInputOutputSlot->Connect(layer->GetInputSlot(0));
+
+            return true;
+        }
+    }
+    return false;
+}
+
 } // namespace armnn
diff --git a/src/armnn/NetworkUtils.hpp b/src/armnn/NetworkUtils.hpp
index a922770..77dd068 100644
--- a/src/armnn/NetworkUtils.hpp
+++ b/src/armnn/NetworkUtils.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -29,4 +29,6 @@
 
 std::vector<DebugLayer*> InsertDebugLayerAfter(Graph& graph, Layer& layer);
 
+bool RevertConstantWeightsToFP32(Layer* layer);
+
 } // namespace armnn
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index 900e763..0421f31 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -9,6 +9,7 @@
 #include "ConvertConstants.hpp"
 #include "ConvertConstDequantisationLayersToConstLayers.hpp"
 #include "ConvertConstPermuteLayersToConstLayers.hpp"
+#include "FuseConvertFp32ToBf16IntoConstLayers.hpp"
 #include "ConvertFp32NetworkToBf16.hpp"
 #include "ConvertFp32NetworkToFp16.hpp"
 #include "FoldPadIntoLayer2d.hpp"
diff --git a/src/armnn/optimizations/FuseConvertFp32ToBf16IntoConstLayers.hpp b/src/armnn/optimizations/FuseConvertFp32ToBf16IntoConstLayers.hpp
new file mode 100644
index 0000000..d112010
--- /dev/null
+++ b/src/armnn/optimizations/FuseConvertFp32ToBf16IntoConstLayers.hpp
@@ -0,0 +1,90 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Optimization.hpp"
+#include <armnnUtils/FloatingPointConverter.hpp>
+#include <ResolveType.hpp>
+
+namespace armnn
+{
+namespace optimizations
+{
+
+class FuseConvertFp32ToBf16IntoConstLayers
+{
+public:
+    void Run(Graph& graph, InputSlot& connection) const
+    {
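+        // Invoked by OptimizeForConnection for each Constant -> ConvertFp32ToBf16 connection in the graph.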
+        Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer();
+        Layer& child = connection.GetOwningLayer();
+
+        ARMNN_ASSERT(base.GetType() == LayerType::Constant);
+        ARMNN_ASSERT(child.GetType() == LayerType::ConvertFp32ToBf16);
+
+        auto dataType = base.GetDataType();
+        switch (dataType)
+        {
+            case DataType::Float32:
+                ReplaceConvertFp32ToBf16Layer<DataType::BFloat16>(
+                        graph,
+                        PolymorphicDowncast<ConstantLayer*>(&base),
+                        PolymorphicDowncast<ConvertFp32ToBf16Layer*>(&child));
+                break;
+            default:
+                throw InvalidArgumentException(GetDataTypeName(dataType) +
+                                               std::string(" Constant Layer cannot be fused into ")  +
+                                               GetDataTypeName(child.GetDataType()) +
+                                               std::string(" conversion layer."));
+        }
+    }
+protected:
+    FuseConvertFp32ToBf16IntoConstLayers()  = default;
+    ~FuseConvertFp32ToBf16IntoConstLayers() = default;
+private:
+    template<armnn::DataType ArmnnType,
+             typename T = armnn::ResolveType<ArmnnType>>
+    static void ReplaceConvertFp32ToBf16Layer(Graph& graph,
+                                              ConstantLayer* constantLayer,
+                                              ConvertFp32ToBf16Layer* convertFp32ToBf16layer)
+    {
+        IgnoreUnused(graph);
+        /**
+         * This optimisation finds situations where a constant set of inputs feeds a
+         * ConvertFp32ToBf16 layer. In that case we don't want the overhead of converting
+         * the values on every inference; instead we convert them once and store them in
+         * the Constant layer, to be reused since they will not change.
+         */
+        TensorInfo outputConvertFp32ToBf16Info = convertFp32ToBf16layer->GetOutputSlot(0).GetTensorInfo();
+        std::vector<T> newValues(outputConvertFp32ToBf16Info.GetNumElements());
+
+        armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(
+                constantLayer->m_LayerOutput->GetConstTensor<float>(),
+                outputConvertFp32ToBf16Info.GetNumElements(),
+                newValues.data());
+        TensorInfo newInfo = outputConvertFp32ToBf16Info;
+        newInfo.SetConstant(true);
+        ConstTensor newInput(newInfo, newValues);
+
+        constantLayer->m_LayerOutput.reset(new ScopedTensorHandle(newInput));
+
+        // Move all connections from the convertFp32ToBf16layer output slot to the constant layer.
+        // The ConvertFp32ToBf16 layer will be removed if left unconnected.
+        convertFp32ToBf16layer->GetOutputSlot().MoveAllConnections(constantLayer->GetOutputSlot());
+
+        // Update the constant layer's output tensor info
+        constantLayer->GetOutputSlot(0).SetTensorInfo(newInfo);
+        ARMNN_ASSERT(constantLayer->GetOutputSlot(0).GetTensorInfo().IsConstant() == true);
+    }
+};
+
+using FuseConversionLayersIntoConstLayers = OptimizeForConnection<ConstantLayer,
+                                                                  ConvertFp32ToBf16Layer,
+                                                                  FuseConvertFp32ToBf16IntoConstLayers>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/test/optimizations/FuseConvertF32BF16IntoConstLayerTests.cpp b/src/armnn/test/optimizations/FuseConvertF32BF16IntoConstLayerTests.cpp
new file mode 100644
index 0000000..93d5948
--- /dev/null
+++ b/src/armnn/test/optimizations/FuseConvertF32BF16IntoConstLayerTests.cpp
@@ -0,0 +1,151 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <LayersFwd.hpp>
+#include <Network.hpp>
+#include <NetworkUtils.hpp>
+#include <Optimizer.hpp>
+#include <TestUtils.hpp>
+
+#include <armnn/backends/TensorHandle.hpp>
+
+#include <doctest/doctest.h>
+
+TEST_SUITE("Optimizer")
+{
+using namespace armnn;
+using namespace armnn::optimizations;
+
+TEST_CASE("FuseConvertFp32Fp16intoConst")
+{
+    Graph graph;
+    const unsigned int shape[] = {1, 2, 2, 3};
+
+    const TensorInfo constTensorInfo(4, shape, DataType::Float32, 1.0, 0, true);
+    const TensorInfo outputConvertInfo(4, shape, DataType::BFloat16, 1.0, 0, true);
+
+    ConstantLayer* constantLayer = graph.AddLayer<ConstantLayer>("constant");
+    std::vector<float> constantValues(constTensorInfo.GetNumElements(), 3.1416f);
+    ConstTensor constTensor(constTensorInfo, constantValues.data());
+    constantLayer->m_LayerOutput = std::make_shared<ScopedTensorHandle>(constTensor);
+    constantLayer->GetOutputSlot().SetTensorInfo(constTensorInfo);
+
+    ConvertFp32ToBf16Layer* convertLayer = graph.AddLayer<ConvertFp32ToBf16Layer>("convert");
+    convertLayer->GetOutputSlot().SetTensorInfo(outputConvertInfo);
+
+    OutputLayer* output = graph.AddLayer<OutputLayer>(0, "output");
+
+    // Connect up constant -> convert -> output
+    constantLayer->GetOutputSlot().Connect(convertLayer->GetInputSlot(0));
+    convertLayer->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    auto checkConstantFloat32 = [](const armnn::Layer *const layer) -> bool {
+        return IsLayerOfType<ConstantLayer>(layer) &&
+               (layer->GetDataType() == DataType::Float32);
+    };
+    auto checkConstantBFloat16 = [](const armnn::Layer *const layer) -> bool {
+        return IsLayerOfType<ConstantLayer>(layer) &&
+               (layer->GetDataType() == DataType::BFloat16);
+    };
+
+    CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+                        checkConstantFloat32,
+                        &IsLayerOfType<ConvertFp32ToBf16Layer>,
+                        &IsLayerOfType<OutputLayer>));
+
+    armnn::Optimizer::Pass(graph, MakeOptimizations(FuseConversionLayersIntoConstLayers()));
+
+    CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+                        checkConstantBFloat16,
+                        &IsLayerOfType<OutputLayer>));
+}
+
+TEST_CASE("RevertConstantWeightsToFP32")
+{
+    Graph graph;
+    const unsigned int shape[] = {1, 2, 2, 3};
+
+    const TensorInfo constTensorInfo(4, shape, DataType::Float32, 1.0, 0, true);
+    const TensorInfo outputConvertInfo(4, shape, DataType::BFloat16, 1.0, 0, true);
+
+    TensorInfo inputInfo(4, shape, DataType::Float32);
+    auto* input = graph.AddLayer<InputLayer>(0, "input0");
+    input->GetOutputSlot().SetTensorInfo(inputInfo);
+
+    auto* constantLayer = graph.AddLayer<ConstantLayer>("constant");
+    std::vector<float> constantValues(constTensorInfo.GetNumElements(), 3.1416f);
+    ConstTensor constTensor(constTensorInfo, constantValues.data());
+    constantLayer->m_LayerOutput = std::make_shared<ScopedTensorHandle>(constTensor);
+    constantLayer->GetOutputSlot().SetTensorInfo(constTensorInfo);
+
+    ConvertFp32ToBf16Layer* convertLayerInputs = graph.AddLayer<ConvertFp32ToBf16Layer>("convert");
+    convertLayerInputs->GetOutputSlot().SetTensorInfo(outputConvertInfo);
+    ConvertFp32ToBf16Layer* convertLayerWeights = graph.AddLayer<ConvertFp32ToBf16Layer>("convert2");
+    convertLayerWeights->GetOutputSlot().SetTensorInfo(outputConvertInfo);
+    ConvertFp32ToBf16Layer* convertLayerBiases = graph.AddLayer<ConvertFp32ToBf16Layer>("convert3");
+    convertLayerBiases->GetOutputSlot().SetTensorInfo(outputConvertInfo);
+
+    auto* biases  = graph.AddLayer<armnn::ConstantLayer>("Biases");
+    biases->m_LayerOutput  = std::make_unique<armnn::ScopedTensorHandle>(constTensor);
+    biases->GetOutputSlot().SetTensorInfo(constTensorInfo);
+
+    armnn::Convolution2dDescriptor descriptor;
+    descriptor.m_BiasEnabled = true;
+    auto* conv = graph.AddLayer<armnn::Convolution2dLayer>(descriptor, "conv2d");
+    const armnn::TensorInfo infoFP32({ 2, 3, 8, 1 }, armnn::DataType::Float32);
+    conv->GetOutputSlot().SetTensorInfo(infoFP32);
+
+    auto* output = graph.AddLayer<OutputLayer>(0, "output");
+
+    // Connect up Input    -> Convert ->
+    //            Constant -> Convert -> Conv2d -> Output
+    //            Constant -> Convert ->
+    input->GetOutputSlot().Connect(convertLayerInputs->GetInputSlot(0));
+    constantLayer->GetOutputSlot().Connect(convertLayerWeights->GetInputSlot(0));
+    biases->GetOutputSlot().Connect(convertLayerBiases->GetInputSlot(0));
+
+    convertLayerInputs->GetOutputSlot().Connect(conv->GetInputSlot(0));
+    convertLayerWeights->GetOutputSlot().Connect(conv->GetInputSlot(1));
+    convertLayerBiases->GetOutputSlot().Connect(conv->GetInputSlot(2));
+
+    conv->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    auto checkConstantFloat32 = [](const armnn::Layer *const layer) -> bool {
+        return IsLayerOfType<ConstantLayer>(layer) &&
+               (layer->GetDataType() == DataType::Float32);
+    };
+    auto checkConstantBFloat16 = [](const armnn::Layer *const layer) -> bool {
+        return IsLayerOfType<ConstantLayer>(layer) &&
+               (layer->GetDataType() == DataType::BFloat16);
+    };
+
+    CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+                        &IsLayerOfType<InputLayer>,
+                        checkConstantFloat32,
+                        checkConstantFloat32,
+                        &IsLayerOfType<ConvertFp32ToBf16Layer>,
+                        &IsLayerOfType<ConvertFp32ToBf16Layer>,
+                        &IsLayerOfType<ConvertFp32ToBf16Layer>,
+                        &IsLayerOfType<Convolution2dLayer>,
+                        &IsLayerOfType<OutputLayer>));
+
+    armnn::Optimizer::Pass(graph, MakeOptimizations(FuseConversionLayersIntoConstLayers()));
+
+    bool revert = RevertConstantWeightsToFP32(conv);
+
+    // Erase the now-unconnected conversion layer, as would happen during topological sort.
+    graph.EraseLayer(convertLayerInputs);
+
+    CHECK(revert);
+    CHECK(constantLayer->GetDataType() == DataType::Float32);
+
+    CHECK(CheckSequence(graph.cbegin(), graph.cend(),
+                        &IsLayerOfType<InputLayer>,
+                        checkConstantBFloat16,
+                        checkConstantFloat32,
+                        &IsLayerOfType<Convolution2dLayer>,
+                        &IsLayerOfType<OutputLayer>));
+}
+}