MLBEDSW-3019: Add profiling debug database

 - Added a mechanism to track input-to-output graph transforms for
   debugging the resultant command stream.
 - Provides the base implementation for MLBEDSW-2661

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I2dfe8a409fbde7ad0282bfab5acb11ba1c8b82d8
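
Note: the patch below assumes a DebugDatabase with roughly the surface
sketched here: class-level state and an add_optimised(parent, op)
classmethod, so no instance has to be threaded through the optimisation
passes. The actual ethosu/vela/debug_database.py may differ; the names,
the uid scheme, and add_source are illustrative only.

    # Hypothetical sketch, not the shipped debug_database.py
    class DebugDatabase:
        NULLREF = -1        # assumed sentinel for "no source op known"
        _sourceUID = {}     # input-network op -> unique id
        _optimisedUID = {}  # optimised op -> (own uid, source op uid)

        @classmethod
        def add_source(cls, op):
            # Register an op from the unoptimised input network
            if op not in cls._sourceUID:
                cls._sourceUID[op] = len(cls._sourceUID)

        @classmethod
        def add_optimised(cls, parent, op):
            # Map an op in the optimised graph back to its source op;
            # idempotent, so repeated rewrite passes may call it safely
            if op not in cls._optimisedUID:
                src = cls._sourceUID.get(parent, cls.NULLREF)
                cls._optimisedUID[op] = (len(cls._optimisedUID), src)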
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index e31348b..7304630 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@
 from . import rewrite_graph
 from . import scaling
 from .data_type import DataType
+from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .numeric_util import clamp_sigmoid
@@ -77,6 +78,7 @@
             new_op.attrs["concat_end"] = offset
             new_op.run_on_npu = True
             tens.ops.append(new_op)
+            DebugDatabase.add_optimised(concat_op, new_op)
         assert tens.shape[axis] == offset
 
         # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
@@ -128,6 +130,7 @@
         new_op.attrs["split_end"] = offset_end
         new_op.run_on_npu = True
         new_op.set_output_tensor(tens)
+        DebugDatabase.add_optimised(split_op, new_op)
 
     return tens
 
@@ -399,6 +402,7 @@
             reshape_op.attrs["new_shape"] = desired_shape
             reshape_op.inputs = [inp, new_shape_tens]
             reshape_op.set_output_tensor(reshape_out)
+            DebugDatabase.add_optimised(op, reshape_op)
 
             op.inputs[idx] = reshape_out
 
@@ -492,6 +496,7 @@
             reshape_op.attrs["new_shape"] = reshape_input_shape
             reshape_op.inputs = [reshape_in, new_shape_tens]
             reshape_op.set_output_tensor(out_tens)
+            DebugDatabase.add_optimised(op, reshape_op)
 
             op.outputs[idx] = reshape_in
 
@@ -568,6 +573,7 @@
                     op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
                 )
             )
+        DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -616,6 +622,9 @@
             reshape_op.set_output_tensor(orig_ofm_tensor)
             # Replace this ops OFM to point to the 2D tensor
             op.outputs[0] = fc_ofm_tensor
+            # Record optimisation in debug database
+            DebugDatabase.add_optimised(op, reshape_op)
+            DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -670,6 +679,10 @@
 
             # Mark the op so that it will be removed as passthrough later on
             op.type = Op.Identity
+
+            # Record optimisation in debug database
+            DebugDatabase.add_optimised(op, act_op)
+            DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -788,6 +801,10 @@
         op.name = op.name.replace("Maximum", new_op.name)
         op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
         op.inputs = [shared_in]
+
+        # Record optimisation in debug database
+        DebugDatabase.add_optimised(op, op)
+
     return op
 
 
@@ -812,6 +829,7 @@
     mul_alpha.add_input_tensor(alpha_tens)
     fm_alpha = ofm.clone(op.name + "_alpha")
     mul_alpha.set_output_tensor(fm_alpha)
+    DebugDatabase.add_optimised(op, mul_alpha)
 
     if check_quantized_tens_scaling_equal(ifm, ofm):
         # No identity multiplication is needed
@@ -832,6 +850,7 @@
         mul_identity.add_input_tensor(identity_tens)
         fm_id = ofm.clone(op.name + "_id")
         mul_identity.set_output_tensor(fm_id)
+        DebugDatabase.add_optimised(op, mul_identity)
 
     # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
     op.type = Op.Maximum
@@ -840,6 +859,8 @@
     ifm.consumer_list.remove(op)
     op.add_input_tensor(fm_alpha)
     op.add_input_tensor(fm_id)
+
+    DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -1012,6 +1033,7 @@
         prev_op.set_activation_lut(op.activation_lut)
     # Bypass op
     prev_op.set_output_tensor(ofm)
+    DebugDatabase.add_optimised(op, prev_op)
     return op
 
 
@@ -1052,6 +1074,11 @@
     return op
 
 
+def _record_optimised(op, arch):
+    if op.type != Op.Const:
+        DebugDatabase.add_optimised(op, op)
+
+
 def optimise_graph_a(nng, arch, verbose_graph=False):
     if verbose_graph:
         nng.print_graph()
@@ -1093,6 +1120,10 @@
             nng, sg, arch, [remove_passthrough_tensor], [fuse_activation_function_with_prev, add_padding_fields]
         )
 
+    # Post-optimisation operator debug tracing
+    for sg in nng.subgraphs:
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [_record_optimised])
+
     if verbose_graph:
         nng.print_graph()
     return nng
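
For reference, the final tracing pass added above relies on a post-order
graph walk. A minimal, hypothetical sketch of such a walk is shown below;
Vela's actual rewrite_graph.visit_graph_post_order may differ in detail.
Each op is visited once, after the ops producing its inputs, and each
visitor in op_fns (here _record_optimised) is then applied to it.

    # Illustrative only; matches the call shape used in optimise_graph_a
    def visit_graph_post_order(start_tensors, arch, tensor_fns, op_fns, visited=None):
        visited = set() if visited is None else visited
        for tens in start_tensors:
            for op in tens.ops:  # producers of this tensor
                if op in visited:
                    continue
                visited.add(op)
                # Recurse into the op's inputs first (post-order)
                visit_graph_post_order(op.inputs, arch, tensor_fns, op_fns, visited)
                for fn in op_fns:
                    fn(op, arch)  # e.g. _record_optimised
            for fn in tensor_fns:
                fn(tens, arch)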