blob: 8b24eaf91a9d13445f8fe4fff8d6c7a7c58ec1b2 [file] [log] [blame]
# SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple
import numpy as np
from . import lut
from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters
# Ops that only rearrange or alias tensor data in memory (no arithmetic).
# They are candidates for removal/bypassing during graph optimization
# (see bypass_memory_only_ops / check_memory_only_removed below).
memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)
def _avoid_nhcwb16_for_concat(tens):
# If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
# multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
# aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
# and those addresses are always 16 byte aligned due to the NHCWB16 format.
return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)
def _avoid_nhcwb16_for_split(tens):
# If read offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input
# Return True if NHCWB16 needs to be avoided
def offset_not_aligned(read_offset):
return read_offset is not None and (read_offset.depth % 16) != 0
for cons_op in tens.consumer_list:
if cons_op.ifm == tens:
if offset_not_aligned(cons_op.read_offsets[0]):
return True
if cons_op.ifm2 is not None and cons_op.ifm2 == tens:
if offset_not_aligned(cons_op.read_offsets[1]):
return True
return False
def _avoid_nhcwb16_for_shapes(tens):
    # NHCWB16 is ruled out if any producer or consumer operates on a 4D shape
    # that differs from the tensor's own full shape.
    # Returns True if NHCWB16 needs to be avoided.
    full_shape = Shape4D(tens.shape)

    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            consumed_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            consumed_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if full_shape != consumed_shape:
            return True

    return any(full_shape != prod_op.ofm_shapes[0] for prod_op in tens.ops)
def _avoid_nhcwb16_for_memory_only(tens):
    # Memcpy (DMA) producers/consumers force the linear format, so NHCWB16
    # must be avoided if any such op touches this tensor.
    for op in tens.consumer_list:
        if op.type == Op.Memcpy:
            return True
    for op in tens.ops:
        if op.type == Op.Memcpy:
            return True
    return False
# Check if non linear format can be used
def check_format_restrictions(tens, arch):
    # Decides whether tens may be stored in the NHCWB16 (non-linear) format.
    # Every early return leaves the tensor in linear format; only if all
    # checks pass is tens.needs_linear_format cleared at the end.
    if len(tens.ops) < 1:
        return
    # Graph inputs/constants and tensors with a missing (CPU) consumer must
    # stay linear.
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    # Memory only ifm/ofm exception: DMA ops must use NHCW
    if _avoid_nhcwb16_for_memory_only(tens):
        return

    # Resize bilinear half pixel center implementation requires OFM with linear format to
    # allow stride modification in H/W dimensions.
    for op in tens.ops:
        if op.original_type == Op.ResizeBilinear and op.type == Op.DepthwiseConv2DBias:
            return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                # Walks chains of reshapes; yields True for any consumer that
                # is missing (None) or not run on the NPU.
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    # Yields oper itself plus every reshape reachable through
                    # its output's consumers (depth-first).
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                    yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    # All restrictions passed: NHCWB16 may be used for this tensor.
    tens.needs_linear_format = False
def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on
    # stride/input size: shrink it until its remainder modulo the stride
    # matches that of (total padding - pad_before), or it reaches zero.
    target_remainder = (total_padding - pad_before) % stride
    adjusted_pad_after = pad_after
    while adjusted_pad_after > 0 and adjusted_pad_after % stride != target_remainder:
        adjusted_pad_after -= 1

    return pad_before, adjusted_pad_after
def needed_total_padding(input_size, stride, filter_size):
    # SAME-style padding: the output covers ceil(input_size / stride)
    # positions; return how much total padding the filter needs for that.
    output_size = -(-input_size // stride)  # ceiling division
    required_input = (output_size - 1) * stride + filter_size
    return max(0, required_input - input_size)
# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    # For memory-only ops, propagate the output's equivalence id to every
    # input so they are treated as aliases of the same storage.
    if op.type in memory_only_ops:
        shared_id = op.outputs[0].equivalence_id
        for input_tens in op.inputs:
            input_tens.equivalence_id = shared_id
    return op
def set_ifm_ofm_op_shapes(op, arch, nng):
    # Populate the 4D ifm/ofm shapes of an NPU op that needs them, unless
    # they have already been set.
    if not op.run_on_npu or not op.type.needs_shapes():
        return op
    if not op.ifm_shapes and not op.ofm_shapes:
        op.set_ifm_ofm_shapes()
    return op
def move_splitsliceread_to_consumer(op, cons_op):
    # Fold a SplitSliceRead into its consumer: transfer the read offset and
    # read shape onto the matching consumer input, rewire the consumer
    # directly to the read's ifm, then detach the SplitSliceRead.
    assert op.type == Op.SplitSliceRead

    if cons_op.ifm == op.ofm:
        input_idx = 0
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        input_idx = 1
    else:
        input_idx = None

    if input_idx is not None:
        cons_op.read_offsets[input_idx] = op.read_offsets[0]
        cons_op.read_shapes[input_idx] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[input_idx])
        cons_op.ifm_shapes[input_idx] = op.ifm_shapes[0]

    # Disconnect the SplitSliceRead from the graph
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)
def check_memory_only_removed(op, arch):
    # Sanity check run late in compilation: every NPU memory-only op should
    # have been optimised away by this point.
    if not op.run_on_npu:
        return
    if op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")
def record_optimised(op, arch):
    # Register the op with the debug database, skipping pure graph-structure
    # ops (constants and placeholders).
    if op.type in (Op.Const, Op.Placeholder):
        return
    DebugDatabase.add_optimised(op, op)
def bypass_memory_only_ops(op, arch, nng):
    """Remove or neutralise a memory-only op (reshape/squeeze/etc.).

    If the op's IFM has a single NPU producer, the op is bypassed by
    reconnecting the producer(s) directly to the op's OFM. Otherwise the op
    is converted into a Memcpy (DMA) op. Returns op (possibly retyped).
    """
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    # Memory only operators can be completely removed if there is a one to one
    # connection. The reshape OFM can be connected to the previous op.
    #
    #                Bypassed to
    #                   --->
    #   1x6x6x10           1x6x6x10
    #     ADD                 ADD
    #      |      -------->    |
    #   1x6x6x10    |       1x20x3x6
    #    RESHAPE    |         MEAN
    #      |        ---------|
    #   1x20x3x10
    #     MEAN
    #
    # In the above the ADD OFM = RESHAPE IFM is removed and replaced by
    # the RESHAPE OFM.
    #
    # Then there are two cases when bypassing is not possible. One is when
    # the IFM is produced by the CPU. This tensor must be preserved. It
    # cannot be removed from the graph. The other case is when the IFM has
    # multiple consumers, then it is not possible to just bypass the op and
    # there is a need for a DMA (nop).
    #
    #                Converts to
    #                   --->
    #      1x6x6x10                1x6x6x10
    #    -----ADD-----          -----ADD-----
    #    |           |          |           |
    # 1x6x6x10   1x6x6x10    1x6x6x10   1x6x6x10
    #  RESHAPE     MEAN       DMA OP      MEAN
    #    |                      |
    # 1x20x3x6               1x20x3x6
    #   MEAN                   MEAN
    #
    # If the DMA IFM and DMA OFM ends up in the same memory area
    # the DMA op will be removed when the cmd stream is generated.

    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    if ifm_has_multiple_cons or ifm_is_cpu_produced:
        # Convert to a memcpy op
        op.type = Op.Memcpy
        DebugDatabase.add_optimised(op, op)
    else:
        # Bypass op: reconnect every producer of the IFM straight to the OFM.
        # NOTE(review): each producer's outputs list is replaced wholesale
        # with [ofm] — assumes single-output producers; confirm for
        # multi-output ops.
        ofm = op.ofm
        ifm = op.ifm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

    return op
def convert_depthwise_to_conv(op, arch, nng):
    """Convert a depthwise convolution to a regular convolution when possible.

    A DepthwiseConv2DBias with ifm depth 1 and ofm depth equal to the depth
    multiplier is mathematically identical to a Conv2DBias, so only the op
    type and the weight axis order need to change.

    Raises UnsupportedFeatureError for a depth_multiplier != 1 that does not
    meet those conditions. Returns op.
    """
    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            # Swap the last two weight axes so the multiplier axis becomes the
            # output-channel axis expected by Conv2D.
            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
        else:
            # Bug fix: the two f-strings were previously passed as two
            # separate arguments (a stray comma defeated the intended implicit
            # string concatenation); raise with a single complete message.
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},"
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}"
            )
    return op
def convert_to_lut(op, lut_values, lut_name):
    """Rewrite op as an elementwise Add of (ifm + scalar 0) with a LUT activation.

    The op's original computation is replaced by the lookup table in
    lut_values; the Add with a constant zero scalar is the vehicle that
    streams the IFM through the LUT hardware. Returns op.
    """
    # Rewrite the operation by Add with scalar 0 + LUT activation
    ifm = op.ifm
    ofm = op.ofm
    if ifm is None:
        return op
    # Only 1-byte (8-bit) ifm types are supported by this rewrite.
    assert ifm.dtype.size_in_bytes() == 1
    op.type = Op.Add
    op.name = op.name + "_lut_" + lut_name
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True
    # Create an input tensor containing scalar zero, quantized with the ifm's
    # scale and a zero point of 0 so the Add contributes nothing.
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = ifm.quantization.scale_f32
    quantization.zero_point = 0
    tens = create_const_tensor(ifm.name + "_scalar0", [], ifm.dtype, [0], quantization=quantization)
    op.add_input_tensor(tens)
    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO no shape?

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as the IFM
    op.forced_output_quantization = ifm.quantization

    # the lut tensor datatype needs to match both; the ofm datatype, because these are the values output; and the
    # datatype used to generate the lut values (which is probably the ifm datatype), because we want to avoid any
    # potential overflow errors in create_lut_tensor() caused by converting Python int (which could represent a uint)
    # to NumPy int. this can be guaranteed by checking that the ifm and ofm datatypes are the same
    assert ifm.dtype == ofm.dtype
    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, ofm.dtype)
    op.set_activation_lut(lut_tensor)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)
    return op