MLBEDSW-4075 PACK axis 0 + tanh fails with output diff

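The test failed because the TANH operator had a batch size > 1.
The batch size check is now a generic constraint applied to all supported
operators, with explicit exceptions (e.g. FULLY_CONNECTED, RESHAPE, SOFTMAX).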

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I3570352740c40eb96bd9db965dfa3c91c81ff2ad
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index 3ccf3ab..9488a78 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -1,7 +1,7 @@
 # Supported Ops
 
 This file was automatically generated by Vela using the `--supported-ops-report` parameter.  
-Vela version: `3.6.0rc1.dev11+gac5e33e`
+Vela version: `3.5.1.dev10+gf616c9d6.d20220915`
 
 This file complies with
 [**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md)
@@ -61,19 +61,20 @@
 This is a list of constraints most NPU operators must satisfy in order to be scheduled on the NPU.
 (Operators excluded from certain constraints are shown in brackets [ ] )
 
-- Input(s) and Output tensors must not be dynamic - [Quantize]
-- Input(s) and Output tensors must have a defined shape 
-- Output tensors cannot be scalar - [Quantize]
-- Scalar Input tensors are only valid for op type: ADD, EXPAND_DIMS, MAXIMUM, MEAN, MINIMUM, MUL, QUANTIZE, SPLIT, SPLIT_V, SUB 
-- Input(s) and Output tensors must not be greater than 4D 
-- Input(s), Output and Weight tensors must have quantization parameters - [Shape]
-- Input(s), Output and Weight tensors with quantization scales must be finite 
-- Input and Output tensors must have quantization scales that fit within float32 precision 
-- Constant tensors should not have NoneType-values 
+- Input(s) and Output tensors must not be dynamic - [QUANTIZE]
+- Input(s) and Output tensors must have a defined shape
+- Output tensors cannot be scalar - [QUANTIZE]
+- Scalar Input tensors are only valid for op type: ADD, EXPAND_DIMS, MAXIMUM, MEAN, MINIMUM, MUL, QUANTIZE, SPLIT, SPLIT_V, SUB
+- Input(s) and Output tensors must not be greater than 4D
+- Input(s), Output and Weight tensors must have quantization parameters - [SHAPE]
+- Input(s), Output and Weight tensors with quantization scales must be finite
+- Input and Output tensors must have quantization scales that fit within float32 precision
+- Constant tensors should not have NoneType-values
 - Tensors must be of type: int16, int32, int8, uint8
 - Tensors which are int32 are only valid when op type is: ADD, MUL, SHAPE, SUB
 - Tensor dimensions must be in the range [1, 65535]
 - Per-axis quantization is only supported for the following op types: CONV_2D, DEPTHWISE_CONV_2D, TRANSPOSE_CONV
+- IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK]
 - The fused activation function (if present) must be one of type: LOGISTIC, RELU, RELU6, RELU_N1_TO_1, TANH
 - If a fused activation function is present, the Output tensor must be one of type: int16, int8, uint8
 
@@ -83,7 +84,6 @@
 
 - At least one Input's shape must match the OFM's shape
 - IFM and OFM data types must match
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 
 ### TFLite ADD Constraints
 
@@ -93,7 +93,6 @@
 - Both Input data types must match
 - For IFM that are signed, OFM must also be signed
 - For IFM that are unsigned, OFM must either be the same type or int32
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
 
 ### TFLite AVERAGE_POOL_2D Constraints
@@ -103,7 +102,6 @@
 - Stride values for both width and height must be integer types
 - IFM and OFM data types must match
 - Kernel filter values for both width and height must be integer types
-- IFM Tensor batch size must be 1
 - Stride values for both width and height must be in the range [1, 3]
 - Kernel filter values for both width and height must be in the range [1, 8]
 - VALID padding: Kernel filter height must be in the range [1, 256]
@@ -134,7 +132,6 @@
 - The sum of the weights cannot exceed 8323072
 - Optional Bias tensor must be of type: int32, int64
 - Optional Bias tensor values must fit within 40-bits
-- IFM Tensor batch size must be 1
 
 ### TFLite DEPTHWISE_CONV_2D Constraints
 
@@ -151,7 +148,6 @@
 - The sum of the weights cannot exceed 8323072
 - Optional Bias tensor must be of type: int32, int64
 - Optional Bias tensor values must fit within 40-bits
-- IFM Tensor batch size must be 1
 - For depth multipliers > 1, IFM channels must be 1 and OFM channels must be equal to the depth multiplier
 
 ### TFLite EXPAND_DIMS Constraints
@@ -184,7 +180,6 @@
 
 - At least one Input's shape must match the OFM's shape
 - IFM and OFM data types must match
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 
 ### TFLite MAXIMUM Constraints
 
@@ -192,7 +187,6 @@
 
 - At least one Input's shape must match the OFM's shape
 - IFM and OFM data types must match
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 - Both Input quantization parameters must match OFM quantization parameters
 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
 
@@ -203,7 +197,6 @@
 - Stride values for both width and height must be integer types
 - IFM and OFM data types must match
 - Kernel filter values for both width and height must be integer types
-- IFM Tensor batch size must be 1
 - Stride values for both width and height must be in the range [1, 3]
 - Kernel filter height must be in the range [1, 256]
 - Product of kernel filter width and height must be in the range [1, 65536]
@@ -215,7 +208,6 @@
 - IFM must be int8 or uint8
 - Input tensor must be at least 2D
 - Axis indices must correspond to height and width axes
-- IFM Tensor batch size must be 1
 - Product of height and width must be no greater than 65536
 - Product of height and width must be no greater than 4096 when:  
         IFM and OFM have different scale or zero point; or  
@@ -236,7 +228,6 @@
 
 - At least one Input's shape must match the OFM's shape
 - IFM and OFM data types must match
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 - Both Input quantization parameters must match OFM quantization parameters
 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
 
@@ -248,7 +239,6 @@
 - Both Input data types must match
 - For IFM that are signed, OFM must also be signed
 - For IFM that are unsigned, OFM must either be the same type or int32
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
 
 ### TFLite PAD Constraints
@@ -335,7 +325,6 @@
 - Both Input data types must match
 - For IFM that are signed, OFM must also be signed
 - For IFM that are unsigned, OFM must either be the same type or int32
-- Batch size must be 1 for Input tensors with more than 2 dimensions
 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
 
 ### TFLite TRANSPOSE_CONV Constraints
@@ -353,7 +342,6 @@
 - The sum of the weights cannot exceed 8323072
 - Optional Bias tensor must be of type: int32, int64
 - Optional Bias tensor values must fit within 40-bits
-- IFM Tensor batch size must be 1
 - Stride values for both width and height must be 2
 - SAME padding: OFM dimensions must equal IFM dimensions multiplied by stride
 - VALID padding: OFM dimensions must equal IFM dimensions multiplied by stride,  
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index 3872bdc..35fc1a6 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -550,25 +550,25 @@
 
 def test_constraint_elemwise_batch_size():
     # BINARY CASE
-    # Batch can be >1 if dims is <=2D
-    op = testutil.create_elemwise_op(Op.Add, "op", [2, 2], [2, 2], [2, 2])
+    # Batch can be >1 if dims is <=3D
+    op = testutil.create_elemwise_op(Op.Add, "op", [2, 2, 2], [2, 2, 2], [2, 2, 2])
     assert support.is_operator_supported(op)
-    # For dims >2D, batch must be 1
-    op = testutil.create_elemwise_op(Op.Add, "op", [1, 2, 2], [1, 2, 2], [1, 2, 2])
+    # For dims >3D, batch must be 1
+    op = testutil.create_elemwise_op(Op.Add, "op", [1, 2, 2, 2], [1, 2, 2, 2], [1, 2, 2, 2])
     assert support.is_operator_supported(op)
     # invalid case
-    op = testutil.create_elemwise_op(Op.Add, "op", [2, 2, 2], [2, 2, 2], [2, 2, 2])
+    op = testutil.create_elemwise_op(Op.Add, "op", [2, 2, 2, 2], [2, 2, 2, 2], [2, 2, 2, 2])
     assert not support.is_operator_supported(op)
 
     # UNARY CASE
-    # Batch can be >1 if dims is <=2D
-    op = testutil.create_elemwise_op(Op.CLZ, "op", [2, 2], None, [2, 2], datatype=DataType.int32)
+    # Batch can be >1 if dims is <=3D
+    op = testutil.create_elemwise_op(Op.CLZ, "op", [2, 2, 2], None, [2, 2, 2], datatype=DataType.int32)
     assert support.is_operator_supported(op)
-    # For dims >2D, batch must be 1
-    op = testutil.create_elemwise_op(Op.CLZ, "op", [1, 2, 2], None, [1, 2, 2], datatype=DataType.int32)
+    # For dims >3D, batch must be 1
+    op = testutil.create_elemwise_op(Op.CLZ, "op", [1, 2, 2, 2], None, [1, 2, 2, 2], datatype=DataType.int32)
     assert support.is_operator_supported(op)
     # invalid case
-    op = testutil.create_elemwise_op(Op.CLZ, "op", [2, 2, 2], None, [2, 2, 2], datatype=DataType.int32)
+    op = testutil.create_elemwise_op(Op.CLZ, "op", [2, 2, 2, 2], None, [2, 2, 2, 2], datatype=DataType.int32)
     assert not support.is_operator_supported(op)
 
 
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index b6f9796..d42caf5 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 from .data_type import DataType
+from .numeric_util import full_shape
 from .operation import Op
 from .operation import Padding
 from .supported_operators_util import docstring_format_args
@@ -206,9 +207,20 @@
         self.generic_constraints.append(TFLiteSupportedOperators.constraint_tens_int32_ops)
         self.generic_constraints.append(TFLiteSupportedOperators.constraint_tens_dimension)
         self.generic_constraints.append(TFLiteSupportedOperators.constraint_tens_quant_per_axis)
+        self.generic_constraints.append(TFLiteSupportedOperators.constraint_batch_size)
         self.generic_constraints.append(TFLiteSupportedOperators.constraint_faf)
         self.generic_constraints.append(TFLiteSupportedOperators.constraint_faf_type)
 
+        # Setup generic constraint exceptions
+        self.generic_constraints_exceptions = defaultdict(list)
+        self.generic_constraints_exceptions[Op.FullyConnected].append(TFLiteSupportedOperators.constraint_batch_size)
+        self.generic_constraints_exceptions[Op.Softmax].append(TFLiteSupportedOperators.constraint_batch_size)
+        self.generic_constraints_exceptions[Op.Reshape].append(TFLiteSupportedOperators.constraint_batch_size)
+        self.generic_constraints_exceptions[Op.Shape].append(TFLiteSupportedOperators.constraint_batch_size)
+        self.generic_constraints_exceptions[Op.Squeeze].append(TFLiteSupportedOperators.constraint_batch_size)
+        for op_type in TFLiteSupportedOperators.split_ops - set((Op.UnpackReshaped,)):
+            self.generic_constraints_exceptions[op_type].append(TFLiteSupportedOperators.constraint_batch_size)
+
         # Setup specific constraints. Note: the order matters
         self.specific_constraints = defaultdict(list)
 
@@ -223,7 +235,6 @@
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_weights_limit)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_type)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_40bit)
-            self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_batch_size)
         # Depthwise Conv specific checks:
         for op_type in TFLiteSupportedOperators.depthwise_convolution_ops:
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_depth_multiplier)
@@ -235,7 +246,6 @@
 
         # Pooling checks:
         for op_type in TFLiteSupportedOperators.pooling_ops:
-            self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_batch_size)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_stride_range)
         # AVG pooling specific checks:
         for op_type in TFLiteSupportedOperators.avg_pooling_ops:
@@ -268,9 +278,7 @@
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_type)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_40bit)
 
-        # Element-wise checks:
-        for op_type in TFLiteSupportedOperators.elem_wise_main_ops:
-            self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_elemwise_batch_size)
+        # Element-wise checks
         # Binary Min/Max specific checks:
         for op_type in TFLiteSupportedOperators.binary_elem_wise_min_max_ops:
             self.specific_constraints[op_type].append(
@@ -302,7 +310,6 @@
         self.specific_constraints[Op.Pad].append(TFLiteSupportedOperators.constraint_pad_type)
 
         # Mean specific checks:
-        self.specific_constraints[Op.Mean].append(TFLiteSupportedOperators.constraint_batch_size)
         self.specific_constraints[Op.Mean].append(TFLiteSupportedOperators.constraint_mean_height_width_product_avgpool)
         self.specific_constraints[Op.Mean].append(TFLiteSupportedOperators.constraint_mean_height_width_product)
         self.specific_constraints[Op.Mean].append(TFLiteSupportedOperators.constraint_mean_height_width_product_int8)
@@ -319,7 +326,10 @@
                 print(f"Info: {ext_type} '{op.name}' is a CPU only op")
             return False
 
-        for constraint in self.generic_constraints + self.specific_constraints[op.type]:
+        op_exceptions = self.generic_constraints_exceptions[op.type]
+        generic_constraints = [constraint for constraint in self.generic_constraints if constraint not in op_exceptions]
+
+        for constraint in generic_constraints + self.specific_constraints[op.type]:
             valid, extra = constraint(op)
             if not valid:
                 print(f"Warning: {ext_type} '{op.name}' is not supported on the NPU. Placing on CPU instead")
@@ -497,9 +507,16 @@
     @staticmethod
     def constraint_batch_size(op):
         "IFM Tensor batch size must be 1"
-        ifm = op.ifm
-        valid = ifm.shape[0] == 1
-        return valid, f"Tensor '{ifm.name}' has batch size: {ifm.shape[0]}"
+        valid = True
+        extra = []
+        for tens in (op.ifm, op.ifm2):
+            if tens is not None:
+                batch_size = full_shape(4, tens.shape, 1)[0]
+                if batch_size != 1:
+                    valid = False
+                    extra.append(f"Tensor '{tens.name}' has batch size: {batch_size}")
+        extra = "\n   ".join(extra)
+        return valid, extra
 
     @staticmethod
     def constraint_depth_multiplier(op):
@@ -753,20 +770,6 @@
         return valid, f"Op has tensors with different quantization parameters to the OFM '{op.ofm.name}': {extra}"
 
     @staticmethod
-    def constraint_elemwise_batch_size(op):
-        "Batch size must be 1 for Input tensors with more than 2 dimensions"
-        valid = True
-        extra = []
-        for tens in (op.ifm, op.ifm2):
-            # Unary ops have ifm2 as None
-            if tens is not None:
-                if (len(tens.shape) > 2) and (tens.shape[0] != 1):
-                    valid = False
-                    extra.append(tens.name)
-        extra = ", ".join(extra)
-        return valid, f"Op has invalid input tensors: {extra}"
-
-    @staticmethod
     def constraint_broadcast_shapes(op):
         "Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2"
         ifm_shape = op.ifm.shape
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index 15e1569..192862e 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -94,6 +94,9 @@
         self.generic_constraints.append(TosaSupportedOperators.constraint_rank)  # TODO not supported for all ops yet
         self.generic_constraints.append(TosaSupportedOperators.constraint_batch)  # TODO not supported for all ops yet
 
+        # Setup generic constraint exceptions
+        self.generic_constraints_exceptions = defaultdict(list)
+
         # Setup specific constraints. Note: the order matters
         self.specific_constraints = defaultdict(list)
 
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index a42b218..7740711 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -45,6 +45,7 @@
 from .tflite.Model import Model
 from .tflite_mapping import builtin_operator_map
 from .tflite_mapping import builtin_operator_name_map
+from .tflite_mapping import optype_to_builtintype
 from .tflite_model_semantic import TFLiteSemantic
 from .tflite_supported_operators import TFLiteSupportedOperators
 from .tosa_model_semantic import TosaSemantic
@@ -178,6 +179,12 @@
     # To easily exclude NetworkType from generated documentation.
     exclude_generation_network_type_value = [NetworkType.TOSA.value]
 
+    def _exclude_list_names(constraint, exclude_list):
+        constraints_excluded_names = [
+            optype_to_builtintype(op) for op, exclude_constraint in exclude_list if constraint in exclude_constraint
+        ]
+        return f" - [{', '.join(sorted(constraints_excluded_names))}]" if constraints_excluded_names else ""
+
     lines = [
         "# Supported Ops",
         "",
@@ -256,20 +263,13 @@
         for constraint in semantic_checker.generic_constraints:
             # Markdown needs two spaces at the end of a line to render it as a separate line
             reason = constraint.__doc__.replace("\n", "  \n")
-
             exclude_list = TFLiteSemantic.get_generic_constraint_exclude_list().items()
-            constraints_excluded_names = [
-                op.name for op, exclude_constraint in exclude_list if constraint in exclude_constraint
-            ]
-            excluded_constraint_text = ""
-            if constraints_excluded_names:
-                excluded_constraint_text = f"- [{', '.join(constraints_excluded_names)}]"
-
-            lines.append(f"- {reason} {excluded_constraint_text}")
+            lines.append(f"- {reason}{_exclude_list_names(constraint, exclude_list)}")
         for constraint in supported.generic_constraints:
             # Markdown needs two spaces at the end of a line to render it as a separate line
             reason = constraint.__doc__.replace("\n", "  \n")
-            lines.append(f"- {reason}")
+            exclude_list = supported.generic_constraints_exceptions.items()
+            lines.append(f"- {reason}{_exclude_list_names(constraint, exclude_list)}")
         for op, name in op_constraint_links:
             lines += [
                 "",