TOSA: Fix AVGPOOL scaling

- Only support for AVGPOOL when there is no padding.
  For this case, global scaling can be used.
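
For reference, a minimal sketch of the fixed-point reciprocal the new
code derives (illustrative only, not part of the patch; the 3x3 kernel
is an assumed example):

    kernel_wh = 9                     # 3x3 kernel -> 9 elements
    k = (kernel_wh - 1).bit_length()  # same as 32 - count_leading_zeros(8), i.e. 4
    multiplier = (((1 << 30) + 1) << k) // kernel_wh  # 1908874355
    shift = 30 + k                                    # 34
    # x * multiplier >> shift approximates x // kernel_wh:
    assert (9 * multiplier) >> shift == 1
    assert (18 * multiplier) >> shift == 2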

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I026b83b05f02c57c79f49935f5ec501a6d28bb91
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index f3cddad..2d1245b 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -15,6 +15,8 @@
 # limitations under the License.
 # Description:
 # Early optimisation of the TOSA based network graph, using the rewrite_graph module to do the traversal of the graph.
+import numpy as np
+
 from . import rewrite_graph
 from .api import NpuRoundingMode
 from .data_type import DataType
@@ -80,6 +82,39 @@
     return op
 
 
+# Counts the leading zeros of a, treated as a 32-bit integer
+def count_leading_zeros(a):
+    lz = 32
+    if a != 0:
+        mask = 1 << (32 - 1)
+        lz = 0
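+        # Shift a single-bit mask down from the MSB until it hits the
+        # first set bit; the number of steps is the leading-zero count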
+        while (mask & a) == 0:
+            mask = mask >> 1
+            lz = lz + 1
+    return lz
+
+
+def calc_scaling_avgpool(op, arch, nng):
+    if op.type == Op.AvgPool:
+        top, left, _, _ = op.attrs["explicit_padding"]
+        # TODO Only supported when global scaling can be used,
+        # that is, when there is no padding.
+        assert top == 0 and left == 0
+        assert op.explicit_scaling is None
+        multiplier = []
+        shift = []
+
+        kernel_wh = op.kernel.elements_wh()
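+        # Build a fixed-point reciprocal of the kernel area:
+        # k = ceil(log2(kernel_wh)), so that
+        # x * multiplier >> shift approximates x / kernel_wh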
+        k = 32 - count_leading_zeros(kernel_wh - 1)
+        numerator = np.int64(((1 << 30) + 1) << k)
+        multiplier.append(numerator // kernel_wh)
+        shift.append(30 + k)
+
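+        # Natural rounding together with the explicit per-tensor scale
+        # performs the division by the kernel area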
+        op.rounding_mode = NpuRoundingMode.NATURAL
+        op.explicit_scaling = ExplicitScaling(False, shift, multiplier)
+    return op
+
+
 def remove_const_transpose(op, arch, nng):
     if op.type == Op.Transpose:
         removed = False
@@ -432,6 +467,12 @@
         rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_reshapes])
         sg.refresh_after_modification()
 
+    # TODO Decide when and where calc_scaling_avgpool is best handled
+    for idx, sg in enumerate(nng.subgraphs):
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            nng, sg, arch, [], [calc_scaling_avgpool], rewrite_unsupported=False,
+        )
+
     # Rewrite Operators step
     op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv]
 
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index c619f2f..d368616 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -54,11 +54,13 @@
     # Supported data types
     # TODO will differ compared to TensorFlow Lite, currently set to the same
     supported_op_dtypes = set((DataType.uint8, DataType.int8, DataType.int16, DataType.int32))  # TODO add bool
+    tens_dim_range = (1, 65535)  # TODO HW limitation that is to be resolved in SW
 
     def __init__(self):
         # Setup the generic constraints. Note: the order matters
         self.generic_constraints = []
         self.generic_constraints.append(TosaSupportedOperators.constraint_tens_dtype)
+        self.generic_constraints.append(TosaSupportedOperators.constraint_tens_dimension)
 
         # Setup specific constraints. Note: the order matters
         self.specific_constraints = defaultdict(list)
@@ -69,6 +71,10 @@
         for op_type in TosaSupportedOperators.depthwise_convolution_ops:
             self.specific_constraints[op_type].append(TosaSupportedOperators.constraint_depth_multiplier)
 
+        # Avgpool specific checks
+        for op_type in TosaSupportedOperators.avg_pooling_ops:
+            self.specific_constraints[op_type].append(TosaSupportedOperators.constraint_padding)
+
     def is_operator_supported(self, op):
         ext_type = optype_to_tosa_op_type(op.type)
         if op.type not in TosaSupportedOperators.supported_operators:
@@ -103,13 +109,41 @@
                 extra.append(f"Tensor '{tens.name}' has data type: {tens.dtype}")
         return valid, ", ".join(extra)
 
+    # TODO This duplicates a check present for TFLite, but it is only added temporarily
+    # due to a HW limitation that is to be resolved in SW later on
+    @classmethod
+    @docstring_format_args(tens_dim_range)
+    def constraint_tens_dimension(cls, op):
+        "Tensor dimensions must be in the range [{}, {}]"
+        tens_min, tens_max = cls.tens_dim_range
+        valid = True
+        extra = []
+        tensors = [tens for tens in op.get_ifm_ifm2_weights_ofm() if tens]
+        if not tensors:
+            tensors = [tens for tens in op.inputs if tens]
+        for tens in tensors:
+            if not all(tens_min <= dim <= tens_max for dim in tens.shape):
+                valid = False
+                extra.append(f"Tensor '{tens.name}' has shape: {tens.shape}")
+        return valid, ", ".join(extra)
+
     @staticmethod
     def constraint_ifm_producer(cls, op):
         "Input must be constant data"
         valid = op.ifm.ops and op.ifm.ops[0].type == Op.Const
         return valid, "Op has ifm with non-constant data"
 
-    # TODO duplicates TFLite_supported operators, but support for depth multiplier should be added at a later stage
+    @staticmethod
+    def constraint_padding(op):
+        "Avgpool only supported for no padding"
+        # TODO Only supported when global scaling can be used,
+        # that is, when there is no padding.
+        top, left, _, _ = op.attrs["explicit_padding"]
+        valid = top == 0 and left == 0
+
+        return valid, f"Avgpool with pad_top {top} and pad_left {left}"
+
+    # TODO Duplicates tflite_supported_operators, but support for depth multiplier should be added at a later stage
     @staticmethod
     def constraint_depth_multiplier(op):
         "For depth multipliers > 1, IFM channels must be 1 and OFM channels must be equal to the depth multiplier"