MLBEDSW-3019: Add profiling debug database
- Added mechanism to track input to output graph transforms for
debugging the resultant command stream.
- Provides base implementation for MLBEDSW-2661
Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I2dfe8a409fbde7ad0282bfab5acb11ba1c8b82d8
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index e31348b..7304630 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@
from . import rewrite_graph
from . import scaling
from .data_type import DataType
+from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .numeric_util import clamp_sigmoid
@@ -77,6 +78,7 @@
new_op.attrs["concat_end"] = offset
new_op.run_on_npu = True
tens.ops.append(new_op)
+ DebugDatabase.add_optimised(concat_op, new_op)
assert tens.shape[axis] == offset
# If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
@@ -128,6 +130,7 @@
new_op.attrs["split_end"] = offset_end
new_op.run_on_npu = True
new_op.set_output_tensor(tens)
+ DebugDatabase.add_optimised(split_op, new_op)
return tens
@@ -399,6 +402,7 @@
reshape_op.attrs["new_shape"] = desired_shape
reshape_op.inputs = [inp, new_shape_tens]
reshape_op.set_output_tensor(reshape_out)
+ DebugDatabase.add_optimised(op, reshape_op)
op.inputs[idx] = reshape_out
@@ -492,6 +496,7 @@
reshape_op.attrs["new_shape"] = reshape_input_shape
reshape_op.inputs = [reshape_in, new_shape_tens]
reshape_op.set_output_tensor(out_tens)
+ DebugDatabase.add_optimised(op, reshape_op)
op.outputs[idx] = reshape_in
@@ -568,6 +573,7 @@
op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
)
)
+ DebugDatabase.add_optimised(op, op)
return op
@@ -616,6 +622,9 @@
reshape_op.set_output_tensor(orig_ofm_tensor)
# Replace this ops OFM to point to the 2D tensor
op.outputs[0] = fc_ofm_tensor
+ # Record optimisation in debug database
+ DebugDatabase.add_optimised(op, reshape_op)
+ DebugDatabase.add_optimised(op, op)
return op
@@ -670,6 +679,10 @@
# Mark the op so that it will be removed as passthrough later on
op.type = Op.Identity
+
+ # Record optimisation in debug database
+ DebugDatabase.add_optimised(op, act_op)
+ DebugDatabase.add_optimised(op, op)
return op
@@ -788,6 +801,10 @@
op.name = op.name.replace("Maximum", new_op.name)
op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
op.inputs = [shared_in]
+
+ # Record optimisation in debug database
+ DebugDatabase.add_optimised(op, op)
+
return op
@@ -812,6 +829,7 @@
mul_alpha.add_input_tensor(alpha_tens)
fm_alpha = ofm.clone(op.name + "_alpha")
mul_alpha.set_output_tensor(fm_alpha)
+ DebugDatabase.add_optimised(op, mul_alpha)
if check_quantized_tens_scaling_equal(ifm, ofm):
# No identity multiplication is needed
@@ -832,6 +850,7 @@
mul_identity.add_input_tensor(identity_tens)
fm_id = ofm.clone(op.name + "_id")
mul_identity.set_output_tensor(fm_id)
+ DebugDatabase.add_optimised(op, mul_identity)
# Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
op.type = Op.Maximum
@@ -840,6 +859,8 @@
ifm.consumer_list.remove(op)
op.add_input_tensor(fm_alpha)
op.add_input_tensor(fm_id)
+
+ DebugDatabase.add_optimised(op, op)
return op
@@ -1012,6 +1033,7 @@
prev_op.set_activation_lut(op.activation_lut)
# Bypass op
prev_op.set_output_tensor(ofm)
+ DebugDatabase.add_optimised(op, prev_op)
return op
@@ -1052,6 +1074,11 @@
return op
+def _record_optimised(op, arch):
+ if op.type != Op.Const:
+ DebugDatabase.add_optimised(op, op)
+
+
def optimise_graph_a(nng, arch, verbose_graph=False):
if verbose_graph:
nng.print_graph()
@@ -1093,6 +1120,10 @@
nng, sg, arch, [remove_passthrough_tensor], [fuse_activation_function_with_prev, add_padding_fields]
)
+ # Post-optimisation operator debug tracing
+ for sg in nng.subgraphs:
+ rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [_record_optimised])
+
if verbose_graph:
nng.print_graph()
return nng