IVGCVSW-1434 Add debug mode to Optimizer
    * Modified optimizer to support debug mode via DebugLayer

Change-Id: Ic8f313778e55540c182cf99876c44a0823be04c6
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1731576..189a1ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -298,6 +298,7 @@
     src/armnn/optimizations/SquashEqualSiblings.hpp
     src/armnn/optimizations/OptimizeInverseConversions.hpp
     src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp
+    src/armnn/optimizations/AddDebug.hpp
     third-party/half/half.hpp
     )
 
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index 31f1dc2..33181ce 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -349,15 +349,22 @@
 
 struct OptimizerOptions
 {
-    OptimizerOptions() : m_ReduceFp32ToFp16(false) {}
+    OptimizerOptions() :
+    m_ReduceFp32ToFp16(false)
+    , m_Debug(false)
+    {}
 
-    OptimizerOptions(bool reduceFp32ToFp16)
+    OptimizerOptions(bool reduceFp32ToFp16, bool debug = false)
         : m_ReduceFp32ToFp16(reduceFp32ToFp16)
+        , m_Debug(debug)
     {
     }
 
     // Reduce Fp32 data to Fp16 for faster processing
     bool m_ReduceFp32ToFp16;
+
+    // Add debug data for easier troubleshooting
+    bool m_Debug;
 };
 
 /// Create an optimized version of the network
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 0cf0ed3..ecab504 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -129,6 +129,12 @@
         Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(Fp32NetworkToFp16Converter()));
     }
 
+    // if debug optimization is set, then print out data after each layer
+    if (options.m_Debug)
+    {
+        Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(InsertDebugLayer()));
+    }
+
     // We know that DeviceSpec should be the only implementation of IDeviceSpec.
     const DeviceSpec& spec = *boost::polymorphic_downcast<const DeviceSpec*>(&deviceSpec);
     auto const& supportedBackends = spec.GetSupportedBackends();
diff --git a/src/armnn/NetworkUtils.cpp b/src/armnn/NetworkUtils.cpp
index 1e3add6..9a4ce87 100644
--- a/src/armnn/NetworkUtils.cpp
+++ b/src/armnn/NetworkUtils.cpp
@@ -74,4 +74,33 @@
     return convertLayers;
 }
 
+
+std::vector<DebugLayer*> InsertDebugLayerAfter(Graph& graph, Layer& layer)
+{
+    std::vector<DebugLayer*> debugLayers;
+    debugLayers.reserve(layer.GetNumOutputSlots());
+
+    // Insert a debug layer after each output slot of the given layer
+    for (auto&& outputSlot = layer.BeginOutputSlots(); outputSlot != layer.EndOutputSlots(); ++outputSlot)
+    {
+        // Insert debug layer after the layer
+        const std::string name =
+            std::string("DebugLayerAfter") + layer.GetName();
+
+        const DebugDescriptor descriptor;
+
+        DebugLayer* debugLayer =
+            graph.InsertNewLayer<DebugLayer>(*outputSlot, descriptor, name.c_str());
+
+        // Sets output tensor info for the debug layer.
+        TensorInfo debugInfo = debugLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+
+        debugLayer->GetOutputSlot().SetTensorInfo(debugInfo);
+
+        debugLayers.emplace_back(debugLayer);
+    }
+
+    return debugLayers;
+}
+
 } // namespace armnn
diff --git a/src/armnn/NetworkUtils.hpp b/src/armnn/NetworkUtils.hpp
index dbb8538..b81d5cb 100644
--- a/src/armnn/NetworkUtils.hpp
+++ b/src/armnn/NetworkUtils.hpp
@@ -14,4 +14,6 @@
 
 std::vector<ConvertFp32ToFp16Layer*> InsertConvertFp32ToFp16LayersAfter(Graph& graph, Layer& layer);
 
+std::vector<DebugLayer*> InsertDebugLayerAfter(Graph& graph, Layer& layer);
+
 } // namespace armnn
diff --git a/src/armnn/optimizations/AddDebug.hpp b/src/armnn/optimizations/AddDebug.hpp
new file mode 100644
index 0000000..60271b0
--- /dev/null
+++ b/src/armnn/optimizations/AddDebug.hpp
@@ -0,0 +1,37 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+#include "NetworkUtils.hpp"
+
+namespace armnn
+{
+namespace optimizations
+{
+
+class AddDebugImpl
+{
+public:
+
+    void Run(Graph& graph, Layer& layer) const
+    {
+        if (layer.GetType() != LayerType::Debug && layer.GetType() != LayerType::Output)
+        {
+            // Insert a debug layer after every output slot of this layer,
+            // skipping layers that are already Debug layers and Output layers
+            InsertDebugLayerAfter(graph, layer);
+        }
+    }
+
+protected:
+    AddDebugImpl() = default;
+    ~AddDebugImpl() = default;
+};
+
+using InsertDebugLayer = OptimizeForType<Layer, AddDebugImpl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index a1bff3c..0a6684e 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -12,3 +12,4 @@
 #include "MovePermuteUp.hpp"
 #include "OptimizeInverseConversions.hpp"
 #include "ConvertFp32NetworkToFp16.hpp"
+#include "AddDebug.hpp"
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 30ca520..29d1702 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -768,6 +768,43 @@
                              &IsLayerOfType<armnn::OutputLayer>));
 }
 
+BOOST_AUTO_TEST_CASE(InsertDebugOptimizationTest)
+{
+    armnn::Graph graph;
+
+    const armnn::TensorInfo info({ 2,2,1,3 }, armnn::DataType::Float32);
+
+    // Create the simple test network
+    auto input = graph.AddLayer<armnn::InputLayer>(0, "input");
+    input->GetOutputSlot().SetTensorInfo(info);
+
+    auto floor = graph.AddLayer<armnn::FloorLayer>("floor");
+    floor->GetOutputSlot().SetTensorInfo(info);
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(1, "output");
+
+    // Connect up the layers
+    input->GetOutputSlot().Connect(floor->GetInputSlot(0));
+    floor->GetOutputSlot().Connect(output->GetInputSlot(0));
+
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::FloorLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+
+    // Run the optimizer
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(InsertDebugLayer()));
+
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::DebugLayer>,
+                             &IsLayerOfType<armnn::FloorLayer>,
+                             &IsLayerOfType<armnn::DebugLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+}
+
 void CreateConvolution2dGraph(Graph &graph, const unsigned int* inputShape,
                               const unsigned int* weightsShape, const unsigned int* outputShape,
                               DataLayout dataLayout = DataLayout::NCHW)
diff --git a/src/backends/reference/test/RefOptimizedNetworkTests.cpp b/src/backends/reference/test/RefOptimizedNetworkTests.cpp
index 907e795..68617b9 100644
--- a/src/backends/reference/test/RefOptimizedNetworkTests.cpp
+++ b/src/backends/reference/test/RefOptimizedNetworkTests.cpp
@@ -10,6 +10,7 @@
 #include <reference/RefWorkloadFactory.hpp>
 
 #include <boost/test/unit_test.hpp>
+#include <test/GraphUtils.hpp>
 
 BOOST_AUTO_TEST_SUITE(RefOptimizedNetwork)
 
@@ -209,4 +210,50 @@
     BOOST_TEST(ss.str() == expected.str());
 }
 
+BOOST_AUTO_TEST_CASE(DebugTestOnCpuRef)
+{
+    armnn::Network net;
+
+    armnn::ActivationDescriptor activation1Descriptor;
+    activation1Descriptor.m_Function = armnn::ActivationFunction::BoundedReLu;
+    activation1Descriptor.m_A = 1.f;
+    activation1Descriptor.m_B = -1.f;
+
+    // Defines layers.
+    auto input = net.AddInputLayer(0, "InputLayer");
+    auto activation = net.AddActivationLayer(activation1Descriptor, "ActivationLayer");
+    auto output = net.AddOutputLayer(0, "OutputLayer");
+
+    // Connects layers.
+    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
+    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    armnn::TensorShape shape({4});
+    armnn::TensorInfo info(shape, armnn::DataType::Float32);
+    input->GetOutputSlot(0).SetTensorInfo(info);
+    activation->GetOutputSlot(0).SetTensorInfo(info);
+
+    armnn::IRuntime::CreationOptions options;
+    armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+    std::vector<armnn::BackendId> backends = {armnn::Compute::CpuRef};
+
+    armnn::OptimizerOptions optimizerOptions;
+    optimizerOptions.m_Debug = true;
+
+    armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec(),
+                                                               optimizerOptions);
+
+    const armnn::Graph& graph = static_cast<armnn::OptimizedNetwork*>(optimizedNet.get())->GetGraph();
+    // Tests that all layers are present in the graph.
+    BOOST_TEST(graph.GetNumLayers() == 5);
+
+    // Tests that the vertices exist and have correct names.
+    BOOST_TEST(GraphHasNamedLayer(graph, "InputLayer"));
+    BOOST_TEST(GraphHasNamedLayer(graph, "DebugLayerAfterInputLayer"));
+    BOOST_TEST(GraphHasNamedLayer(graph, "ActivationLayer"));
+    BOOST_TEST(GraphHasNamedLayer(graph, "DebugLayerAfterActivationLayer"));
+    BOOST_TEST(GraphHasNamedLayer(graph, "OutputLayer"));
+}
+
 BOOST_AUTO_TEST_SUITE_END()