# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
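#
# Typical use from the compiler driver (a sketch; the calling context is an
# assumption, only pack_into_passes itself is defined below):
#     nng = pack_into_passes(nng, arch, verbose_packing=False)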
import collections
import enum

from .nn_graph import NpuBlockType, Operation, Pass, PassPlacement, TensorPurpose


class PassFlags(enum.Flag):
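    """Bit flags describing the role an operation plays in a pass and what it may be packed with."""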
Empty = 0
Pre = 1
Main = 2
Post = 4
Mac = 8
Dma = 32
ElementWise = 256
Npu = 512
Cpu = 1024
StartupInit = 2048
MemoryOnly = 4096
PostFusingLimited = 8192
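

# Example: a convolution packed on the NPU ends up with flags
#     PassFlags.Npu | PassFlags.Mac | PassFlags.Main
# (see the mac_main_ops entry in test_sequence below).

# Operation-type sets used by the packing rules in test_sequence.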
npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
mac_main_ops = set(
(
# convolutions
"Conv2DBiasAct",
"Conv2D",
"QuantizedConv2D",
"Conv2DBackpropInputSwitched",
# depth-wise convolutions
"DepthwiseConv2dBiasAct",
"DepthwiseConv2dNative",
"QuantizedDepthwiseConv2D",
# FC layers
"QuantizedMatMul",
"MatMul",
"FullyConnectedAct",
# RNN/LSTM/GRU
"BlockLSTM",
# pooling
"QuantizedMaxPool",
"QuantizedAvgPool",
"AvgPool",
"MaxPool",
"AvgPoolAct",
"MaxPoolAct",
)
)
binary_elem_wise_main_ops = set(
(
# binary element-wise
"AddAct",
"MulAct",
"SubAct",
"QuantizedAdd",
"QuantizedSub",
"QuantizedMul",
"Mul",
"Add",
"Sub",
"Minimum",
"Maximum",
)
)
unary_elem_wise_main_ops = set(("LeakyRelu", "Abs")) # Unary element-wise operations
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
npu_post_ops = activation_ops | set(
# Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
)
npu_post_fuse_limited_ops = set(
# Set of post operators that should not be fused with main/elementwise ops
("ConcatSliceWrite", "Sigmoid", "Tanh")
)
elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))
quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
cpu_ops = (
set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN"))
| quantization_ops
)
npu_dma_ops = set(("DMA",))
startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))
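
# Each entry of test_sequence is a tuple:
#     (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear)
# Entries are tried in order; the first whose ops_set contains the operation
# (None matches any type) and whose incompatible_pack_flags do not overlap the
# flags already accumulated for the pass determines how the operation is packed.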
test_sequence = [
(
# ops_set
npu_post_ops,
# incompatible_pack_flags
PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
# flags_to_set
PassFlags.Npu | PassFlags.Post,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
npu_post_fuse_limited_ops,
# incompatible_pack_flags
PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
# flags_to_set
PassFlags.Npu | PassFlags.PostFusingLimited,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
mac_main_ops,
# incompatible_pack_flags
PassFlags.Cpu
| PassFlags.MemoryOnly
| PassFlags.ElementWise
| PassFlags.Pre
| PassFlags.Main
| PassFlags.PostFusingLimited,
# flags_to_set
PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
elem_wise_main_ops,
# incompatible_pack_flags
PassFlags.Cpu
| PassFlags.MemoryOnly
| PassFlags.Mac
| PassFlags.Pre
| PassFlags.Main
| PassFlags.PostFusingLimited,
# flags_to_set
PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
npu_pre_ops,
# incompatible_pack_flags
PassFlags.Cpu | PassFlags.MemoryOnly,
# flags_to_set
PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
npu_dma_ops,
# incompatible_pack_flags
PassFlags.Cpu | PassFlags.MemoryOnly,
# flags_to_set
PassFlags.Npu | PassFlags.Dma,
# flags_to_clear
        PassFlags.Empty,
),
(
# ops_set
startup_init_ops,
# incompatible_pack_flags
PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
# flags_to_set
PassFlags.StartupInit | PassFlags.Main,
# flags_to_clear
PassFlags.Empty,
),
(
# ops_set
memory_only_ops,
# incompatible_pack_flags
PassFlags.Npu | PassFlags.Cpu,
# flags_to_set
PassFlags.MemoryOnly | PassFlags.Main,
# flags_to_clear
        PassFlags.Empty,
),
(
# ops_set
cpu_ops,
# incompatible_pack_flags
PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
# flags_to_set
PassFlags.Cpu | PassFlags.Main,
# flags_to_clear
        PassFlags.Empty,
),
( # This last one is a fallback for unrecognised operations
# ops_set
None,
# incompatible_pack_flags
PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
# flags_to_set
PassFlags.Cpu | PassFlags.Main,
# flags_to_clear
        PassFlags.Empty,
),
]
# Some sanity checking
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
assert not flags_to_clear & flags_to_set
if operation_set is not None:
for op in operation_set:
assert len(op) > 1 # This is to avoid string literals being decomposed
def pack_into_passes(nng, arch, verbose_packing=False):
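    """Pack the operations of every subgraph in nng into Passes, traversing backwards from the subgraph outputs."""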
def visit_op(op, ignored):
visit_op_refcount[op] += 1
if visit_op_refcount[op] == 1: # First-time visit, go and fix up unused output tensors
for tens in op.outputs:
if len(tens.consumers()) == 0:
visit_op_refcount[op] += 1
assert visit_op_refcount[op] <= len(op.outputs)
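        # The op is ready to pack only once all of its outputs have been visited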
if visit_op_refcount[op] == len(op.outputs):
if op.type in startup_init_ops:
startup_list.append(op)
else:
_, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
if ofm_tensor is None:
ofm_tensor = op.outputs[0]
build_pass((op,), ofm_tensor)
def build_pass(start_ops_to_process, ofm_tensor=None):
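        """Grow a single pass backwards from start_ops_to_process, fusing producer ops while the packing rules allow."""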
reverse_ops_list = []
curr_flags = PassFlags.Empty
npu_block_type = NpuBlockType.Default
reverse_intermediates = []
input_set = set()
ifm_tensor = None
primary_op = None
to_process = collections.deque()
for start_op in start_ops_to_process:
to_process.append((start_op, None))
while to_process:
curr_op, tens = to_process.popleft()
if curr_op in reverse_ops_list:
continue
for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
if operation_set is None or curr_op.type in operation_set:
if not (curr_flags & incompatible_pack_flags):
if flags_to_set & PassFlags.Npu:
if not curr_op.run_on_npu:
continue
reverse_ops_list.append(curr_op)
new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
if new_block_type != NpuBlockType.Default:
assert npu_block_type == NpuBlockType.Default
npu_block_type = new_block_type # Only one major block type per pass
assert primary_op is None
primary_op = curr_op
curr_flags &= ~flags_to_clear
curr_flags |= flags_to_set
if flags_to_set & PassFlags.Npu:
if flags_to_set & (
PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
):
assert len(curr_op.inputs) >= 1
if curr_op.type == "BlockLSTM":
ifm_tensor = curr_op.inputs[3]
else:
ifm_tensor = curr_op.inputs[0]
assert ifm_tensor.purpose == TensorPurpose.FeatureMap
if flags_to_set & PassFlags.Dma:
# DMAs are special - Output buffers need to be preserved as intermediates,
# if the pass consumes the results
if tens is not None:
reverse_intermediates.append(tens)
if operation_set is None:
print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
for inp in curr_op.inputs:
can_pack = True
if len(inp.ops) == 1:
next_op = inp.ops[0]
for outp in next_op.outputs:
consumers = outp.consumers()
if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
can_pack = False
break
else:
can_pack = False
if can_pack:
to_process.append((next_op, inp))
else:
assert inp is not None
input_set.add(inp)
break
            else:
                # For-else: no packing rule fired, so this operation is not
                # compatible with the ops already packed into this pass. Just
                # register the tensor as an input to the pass.
assert tens is not None
input_set.add(tens)
if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
            # If the pass has no MAC operation, ops that could run on either
            # engine are routed to the element-wise unit
curr_flags |= PassFlags.ElementWise
is_element_wise = True
for op in reverse_ops_list:
            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
is_element_wise = False
break
placement = PassPlacement.Unknown
if curr_flags & PassFlags.Npu:
assert placement == PassPlacement.Unknown
placement = PassPlacement.Npu
if curr_flags & PassFlags.Cpu:
assert placement == PassPlacement.Unknown
placement = PassPlacement.Cpu
if curr_flags & PassFlags.MemoryOnly:
assert placement == PassPlacement.Unknown
placement = PassPlacement.MemoryOnly
if curr_flags & PassFlags.StartupInit:
assert placement == PassPlacement.Unknown
placement = PassPlacement.StartupInit
assert placement != PassPlacement.Unknown
ops_list = list(reversed(reverse_ops_list))
intermediates = list(reversed(reverse_intermediates))
        if primary_op is None:
            primary_op = create_primary_op(ops_list)
            if primary_op is not None:
visit_tensor_refcount[primary_op.inputs[0]] += 1
npu_block_type = primary_op.attrs["npu_block_type"]
for input_tens in primary_op.inputs:
if input_tens not in input_set:
input_set.add(input_tens)
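
        # Build an ordered, de-duplicated list of the pass's inputs and count
        # how many times each input is consumed within the pass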
ordered_input_list = []
input_refcounts = collections.defaultdict(int)
for op in ops_list:
for inp in op.inputs:
if inp in input_set:
if input_refcounts[inp] == 0:
ordered_input_list.append(inp)
input_refcounts[inp] += 1
name = ops_list[0].name
non_dma_ops = [op for op in ops_list if op.type != "DMA"]
if non_dma_ops:
name = non_dma_ops[0].name
ps = Pass(name, placement, is_element_wise, npu_block_type)
ps.ops = ops_list
ps.primary_op = primary_op
ps.inputs = ordered_input_list
ps.intermediates = intermediates
ps.outputs = list(ops_list[-1].outputs)
ps.ifm_tensor = ifm_tensor
# ElementWise operation, 2 IFMs
if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
ps.ifm_tensor = ps.inputs[0]
if len(ps.inputs) == 1:
# Only 1 input, IFM and IFM2 are the same tensor
ps.ifm2_tensor = ps.inputs[0]
else:
ps.ifm2_tensor = ps.inputs[1]
else:
ps.ifm_tensor = ifm_tensor
ps.ifm2_tensor = None
ps.ofm_tensor = ofm_tensor
assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
for op in ps.ops:
op.scheduled_pass = ps
reverse_pass_list.append(ps)
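
        # Continue the backwards traversal through this pass's inputs, visiting
        # each input once per time it is consumed inside the pass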
for inp, refcount in input_refcounts.items():
for _ in range(refcount):
visit_tensor(inp)
return ps
def visit_tensor(tens):
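        # A tensor is fully visited once every one of its consumers has seen
        # it; only then are its producing ops visited (and possibly packed)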
visit_tensor_refcount[tens] += 1
assert visit_tensor_refcount[tens] <= len(tens.consumers())
if visit_tensor_refcount[tens] == len(tens.consumers()):
for op in reversed(tens.ops):
visit_op(op, tens)
def create_primary_op(ops_list):
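        # Called when a pass has no main operation: if it contains pre/post ops,
        # synthesise a 1x1 AvgPool (effectively an identity) as the primary op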
if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
# Configure a 1x1 AvgPool and attach the op onto it
op = ops_list[0]
inp = op.inputs[0]
avgpool_name = op.name + "_avgpool"
avgpool_op = Operation("AvgPool", avgpool_name)
avgpool_op.inputs = [inp]
avgpool_op.inputs[0].consumer_list.append(avgpool_op)
avgpool_op.attrs["padding"] = b"VALID"
avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
avgpool_op.attrs["stride_w"] = 1
avgpool_op.attrs["stride_h"] = 1
avgpool_op.attrs["filter_width"] = 1
avgpool_op.attrs["filter_height"] = 1
avgpool_op.attrs["strides"] = [1, 1, 1, 1]
avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
avgpool_out = inp.clone("_avgpooled")
avgpool_out.consumer_list.append(op)
avgpool_out.ops = [avgpool_op]
avgpool_op.outputs = [avgpool_out]
op.inputs[0] = avgpool_out
ops_list.insert(0, avgpool_op)
return avgpool_op
return None
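
    # Walk each subgraph backwards from its output tensors; a pass is built
    # once an operation's outputs have all been accounted for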
for sg in nng.subgraphs:
reverse_pass_list = []
visit_op_refcount = collections.defaultdict(int)
visit_tensor_refcount = collections.defaultdict(int)
startup_list = []
for tens in sg.output_tensors:
visit_tensor(tens)
if startup_list:
startup_ps = build_pass(startup_list)
            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fix up the outputs
startup_ps.name = "startup_weight_initialisation"
sg.passes = list(reversed(reverse_pass_list))
sg.build_pass_links()
if verbose_packing:
nng.print_passes()
return nng