TOSA: Added Depthwise support

This mainly adds support for depthwise conv2d
with depth_multiplier = 1.

(There are, however, no suitable test cases: every network
sourced so far has depth_multiplier set to 2, which is not
supported.)

- Added support for depthwise conv2d (see the sketch below).
- Added support for removing Transpose of constant data.
- Added support for removing Reshape ops.
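
The convert_depthwise_to_conv helper that this change moves into
graph_optimiser_util covers the one case where depth_multiplier != 1
can still be handled: if the ifm has a single channel and the ofm
depth equals the depth_multiplier, the depthwise conv2d computes the
same result as an ordinary conv2d, so only the weight axes need
reordering. A minimal numpy sketch of that reordering (the function
name and assumed weight layout are illustrative, not code from this
patch):

    import numpy as np

    def depthwise_weights_to_conv2d(dw_weights):
        # Assumed layout: H x W x M x 1, with the depth_multiplier M
        # in the input-channel position and a single trailing channel.
        assert dw_weights.shape[-1] == 1, "needs a single ifm channel"
        # Same transpose the optimiser applies: swap the last two
        # axes, H x W x M x 1 -> H x W x 1 x M, i.e. HWIO weights for
        # a conv2d with ifm depth 1 and ofm depth M.
        return np.transpose(dw_weights, (0, 1, 3, 2))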

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I143e6246becfa78fd9f7510af0bf0d6b3fbbf2c7
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 255a1f5..ef39aea 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -30,7 +30,10 @@
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
+from .graph_optimiser_util import bypass_reshape_and_squeeze_ops
 from .graph_optimiser_util import calc_explicit_padding
+from .graph_optimiser_util import convert_depthwise_to_conv
+from .graph_optimiser_util import fix_sg_input_output
 from .graph_optimiser_util import needed_total_padding
 from .graph_optimiser_util import set_ifm_ofm_op_shapes
 from .graph_optimiser_util import set_tensor_equivalence
@@ -244,32 +247,6 @@
     DebugDatabase.add_optimised(tens.ops[0], copy_op)
 
 
-def fix_sg_input_output(op, arch, nng):
-    if not op.run_on_npu or op.type not in (Op.Reshape, Op.Squeeze):
-        return op
-
-    # For the Reshape/Squeeze operators we want to remove, tensors are removed.
-    # But in order to to do this, they cannot be outputs of the sg,
-    # this need to be fixed prior to the removal.
-    # Solution is to add a avgpool NOP, to maintain the original tensor.
-    # This is also valid when reshape ifm/ofm is produced respectively
-    # consumed by CPU
-
-    # Check if operator ifm/ofm are sg ifm/ofm
-    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
-    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
-    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
-    # Check if ifm/ofm is produced respectively consumed by CPU
-    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
-    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)
-
-    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
-        # Both ifm and ofm need to persist, but only ifm need a copy, in order to remove the Reshape/Squeeze
-        insert_copy_op_after_tens(op.ifm)
-
-    return op
-
-
 def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
     k_w, k_h = kernel.dilated_wh()
     s_x, s_y = kernel.stride
@@ -576,33 +553,6 @@
     return op
 
 
-def convert_depthwise_to_conv(op, arch, nng):
-    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
-    # the ofm depth equals the depth multipler.
-    # If those conditions are true, then we can perform a simple
-    # switch of the operator type (and weight order)
-
-    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
-        ifm_shape = op.ifm_shapes[0]
-        weight_tensor = op.inputs[1]
-        ofm_shape = op.ofm_shapes[0]
-        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
-            # Change op type to Conv2d
-            op.type = Op.Conv2DBias
-            del op.attrs["channel_multiplier"]
-            del op.attrs["depth_multiplier"]
-
-            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
-            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
-        else:
-            raise UnsupportedFeatureError(
-                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
-                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
-            )
-        DebugDatabase.add_optimised(op, op)
-    return op
-
-
 def reorder_depthwise_weights(op, arch, nng):
     if op.type.is_depthwise_conv2d_op():
         weight_tensor = op.inputs[1]
@@ -1058,35 +1008,7 @@
             # or the reshape need to be replace with a NOP.
             return
 
-        # Check if ifm/ofm are network ifm/ofm
-        ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
-        ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
-        ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
-        # Check if ifm/ofm is produced respectively consumed by CPU
-        ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
-        ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)
-
-        # This case should be handled prior to this function
-        assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))
-
-        if ofm_is_sg_ofm or ofm_is_cpu_consumed:
-            # Bypassed by replacing ifm with ofm
-            ofm.ops = []
-            for prev_op in ifm.ops:
-                prev_op.outputs = [ofm]
-                ofm.ops.append(prev_op)
-
-            # All ifm consumers need to use ofm as input
-            for ifm_cons in ifm.consumer_list:
-                for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
-                    if cons_ifm == ifm:
-                        ifm_cons.set_input_tensor(ofm, ifm_idx)
-        else:
-            # Bypassed by replacing ofm with ifm
-            for cons in ofm.consumer_list:
-                for ifm_idx, cons_ifm in enumerate(cons.inputs):
-                    if cons_ifm == ofm:
-                        cons.set_input_tensor(ifm, ifm_idx)
+        bypass_reshape_and_squeeze_ops(op)
 
 
 def fuse_activation_function_with_prev(op, arch, nng):