MLBEDSW-4022: support PAD followed by pool operator

PAD followed by a max or average pool is run on the NPU if NPU
padding can be used. The average pool is converted to a depthwise
convolution.
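
As an illustration (a sketch, not part of the patch itself): an average
pool over a k_h x k_w window gives the same result as a depthwise
convolution with all weights equal to 1 and a weight scale of
1/(k_w * k_h), which is why the conversion sets the quantised weight
values to 1 and the scale accordingly. Names below are illustrative only:

    # Minimal NumPy sketch of the average pool / depthwise equivalence.
    import numpy as np

    window = np.arange(6, dtype=np.float32).reshape(2, 3)  # k_h=2, k_w=3
    k_h, k_w = window.shape
    avg_pool = window.mean()                        # average pool output
    depthwise = (window * 1.0).sum() / (k_w * k_h)  # unit weights, scale 1/(k_w*k_h)
    assert np.isclose(avg_pool, depthwise)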

Change-Id: Icc3652e6d9ecff5ac3dc7d92080313d90c245404
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index f5006c6..e1ceb9f 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -26,6 +26,7 @@
 from . import lut
 from . import rewrite_graph
 from . import scaling
+from .api import NpuRoundingMode
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
@@ -46,6 +47,7 @@
 from .tensor import create_const_tensor
 from .tensor import QuantizationParameters
 from .tensor import Tensor
+from .tensor import TensorPurpose
 from .tflite_mapping import optype_to_builtintype
 
 passthrough_nodes = (Op.Identity,)
@@ -1174,19 +1176,55 @@
     return op
 
 
-def optimise_pad(op, arch, nng):
+def optimise_pad(op: Operation, arch, nng):
     """
     Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
     if both operations can be run on the NPU.
     """
     if (
-        (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op())
+        (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_pool_op())
         and op.run_on_npu
         and op.attrs["padding"] == Padding.VALID
     ):
         pad_op = op.ifm.ops[0]
         if pad_op.type != Op.Pad or not pad_op.run_on_npu:
             return op
+        if op.type.is_avgpool_op():
+            # Average pool is converted to depthwise, because NPU average pool + same padding
+            # has a special implementation that is different from PAD followed by average pool with
+            # valid padding.
+            k_w, k_h = op.kernel.width, op.kernel.height
+            ifm = op.ifm
+            # Remember other inputs
+            other_inputs = op.inputs[1:]
+            # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
+            quantization = QuantizationParameters(0.0, 255.0)
+            quantization.scale_f32 = 1.0 / (k_w * k_h)
+            quantization.zero_point = 0
+            shape = [k_h, k_w, 1, op.ofm.shape[-1]]
+            weights = np.full(shape, 1)
+
+            weight_tens = create_const_tensor(
+                op.name + "_weights",
+                shape,
+                op.ifm.dtype,
+                weights,
+                np.uint8,
+                purpose=TensorPurpose.Weights,
+                quantization=quantization,
+            )
+            weight_tens.quant_values = weights
+            op.type = Op.DepthwiseConv2DBias
+            op.inputs = []
+            op.add_input_tensor(ifm)
+            op.add_input_tensor(weight_tens)
+            # Add bias tensor, all biases set to 0
+            op.inputs.append(None)
+            fixup_bias_tensors(op, arch, nng)
+            # Add other inputs
+            op.inputs.extend(other_inputs)
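+            # Natural (round-to-nearest) rounding, to match the rounding of the original average pool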
+            op.rounding_mode = NpuRoundingMode.NATURAL
+
         # Bypass the PAD operator
         op.set_input_tensor(pad_op.ifm, 0)
         # Adjust the padding attributes of the convolution operator
@@ -1231,7 +1269,7 @@
         bias_values = [0] * nr_biases
         bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
         bias_tensor.quant_values = bias_tensor.values
-        op.set_input_tensor(bias_tensor, -1)
+        op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
 
     return op