MLBEDSW-6881 SHAPE single op network is optimised to nothing

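A subgraph output cannot be a constant tensor, so when a SHAPE op that
feeds a subgraph output is statically optimised to a const, the network
ended up with no operations. Fixed by inserting a memory copy (an ADD
with a zero constant) that copies the statically optimised data to the
subgraph output.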

Change-Id: Ica757e37d5460237973444ffd39c7d2850f319e3
Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index b8e61f4..90b2932 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -48,6 +48,7 @@
 from .operation import Op
 from .operation import Operation
 from .operation import Padding
+from .operation_util import create_add_nop
 from .operation_util import create_avgpool_nop
 from .operation_util import get_pad_values_from_input
 from .scaling import quantise_scale
@@ -1801,6 +1802,7 @@
 
         # Convert this SHAPE op to const
         op.type = Op.Const
+        DebugDatabase.add_optimised(op, op)
 
         # Add size calculation to shape output tensors
         ofm.values = np.array(ifm.shape)
@@ -1935,4 +1937,23 @@
         rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
         sg.refresh_after_modification()
 
+    # Make sure that const optimisations on subgraph outputs are handled correctly
+    for sg in nng.subgraphs:
+        for ofm in sg.output_tensors:
+            if ofm.is_const and ofm.ops[0].type_changed:
+                # Subgraph output cannot be const - insert a memory copy
+                op = ofm.ops[0]
+                ofm_clone = ofm.clone()
+                ofm_clone.values = ofm.values
+                ofm.values = None
+                np_dtype = ofm.dtype.as_numpy_type()
+                zero = create_const_tensor("zero", [1], ofm.dtype, [0], np_dtype, quantization=ofm.quantization)
+                memcpy = create_add_nop(f"{ofm.name}_copy")
+                memcpy.add_input_tensor(ofm_clone)
+                memcpy.add_input_tensor(zero)
+                memcpy.set_output_tensor(ofm)
+                memcpy.set_ifm_ofm_shapes()
+                op.set_output_tensor(ofm_clone)
+                DebugDatabase.add_optimised(op, memcpy)
+
     return nng