Fix CONV ops int16 test failures after TensorFlow update

Add support for setting the accumulator type using the quantized_bias_type attribute.

Change-Id: Ibde1149143b510a1c650a5a037d3ab92d878d7cd
Signed-off-by: William Isaksson <william.isaksson@arm.com>
diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index 589a283..7125e88 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -27,7 +27,7 @@
 
 
 API_VERSION_MAJOR = 1
-API_VERSION_MINOR = 4
+API_VERSION_MINOR = 5
 API_VERSION = f"{API_VERSION_MAJOR}.{API_VERSION_MINOR}"
 
 
@@ -273,6 +273,16 @@
         self.dilation_y = dilation_y
 
 
+class NpuAccumulatorType(Enum):
+    """
+    Accumulator dtype of NPU operation
+    """
+
+    Default = auto()
+    Int32 = auto()
+    Int40 = auto()
+
+
 class NpuOperationType(Enum):
     """
     Type of NPU operation
@@ -343,6 +353,7 @@
         self.fused_quantize: bool = False
         # IFM upscaling to be applied
         self.ifm_upscale: NpuResamplingMode = NpuResamplingMode.NONE
+        self.accumulator_type: NpuAccumulatorType = NpuAccumulatorType.Default
 
 
 class NpuConv2DOperation(NpuBlockOperation):
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 53df096..06d91a6 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -65,6 +65,7 @@
 from .operation import RoundingMode
 from .register_command_stream_generator import generate_command_stream
 from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
+from .register_command_stream_util import to_npu_acc_type
 from .register_command_stream_util import to_npu_kernel
 from .register_command_stream_util import UNARY_ELEMWISE_OPS
 from .shape4d import Shape4D
@@ -545,6 +546,7 @@
         npu_op.padding = create_padding(cmd, op, npu_op)
         npu_op.kernel = to_npu_kernel(op.kernel)
     npu_op.ifm_upscale = resampling_mode_inv_map[op.ifm_resampling_mode]
+    npu_op.accumulator_type = to_npu_acc_type(op.attrs.get("quantized_bias_type", None))
     return npu_op
 
 
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 9d9a1e6..ec01d3e 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -31,6 +31,7 @@
 
 from . import scaling
 from .api import NpuAccelerator
+from .api import NpuAccumulatorType
 from .api import NpuActivation
 from .api import NpuActivationOp
 from .api import NpuAddressRange
@@ -270,6 +271,11 @@
     SHRAMElements.Acc40: acc_format.INT_40BIT.value,
 }
 
+npu_acc_format_map = {
+    NpuAccumulatorType.Int32: acc_format.INT_32BIT.value,
+    NpuAccumulatorType.Int40: acc_format.INT_40BIT.value,
+}
+
 resampling_mode_map = {
     NpuResamplingMode.NONE: resampling_mode.NONE,
     NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
@@ -574,7 +580,10 @@
     emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
     if has_ifm2(npu_op):
         emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
-    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])
+    if npu_op.accumulator_type != NpuAccumulatorType.Default:
+        emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, npu_acc_format_map[npu_op.accumulator_type])
+    else:
+        emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])
 
 
 def get_block_config_for_npu_op(
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index 74c4f90..8a6f94e 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -21,6 +21,7 @@
 from typing import Optional
 
 from . import numeric_util
+from .api import NpuAccumulatorType
 from .api import NpuActivationOp
 from .api import NpuAddressRange
 from .api import NpuBlockOperation
@@ -42,6 +43,7 @@
 from .operation import Kernel
 from .operation import PointXYZ
 from .tensor import TensorFormat
+from .tflite.TensorType import TensorType
 from ethosu.vela.range_set import AccessDirection
 from ethosu.vela.range_set import MemoryAccessSet
 from ethosu.vela.range_set import MemoryRangeSet
@@ -74,6 +76,15 @@
     check_size(length, required_multiple, "length")
 
 
+def to_npu_acc_type(accType: TensorType) -> NpuAccumulatorType:
+    if accType == TensorType.INT32:
+        return NpuAccumulatorType.Int32
+    elif accType == TensorType.INT64:
+        return NpuAccumulatorType.Int40
+    else:
+        return NpuAccumulatorType.Default
+
+
 def to_npu_kernel(kernel: Kernel) -> NpuKernel:
     """Converts the given internally used kernel object to NpuKernel (of public API)"""
     return NpuKernel(
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index ada2136..48813fe 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -58,7 +58,6 @@
     depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
     transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
     convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops
-    conv_depth_fc_op = convolution_ops | depthwise_convolution_ops | set((Op.FullyConnected,))
     max_pooling_ops = Op.op_set(Op.is_maxpool_op)
     avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
     pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
@@ -239,8 +238,6 @@
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_shape)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_type)
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_bias_40bit)
-        for op_type in TFLiteSupportedOperators.conv_depth_fc_op:
-            self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_no_quantized_bias_type)
         # Transpose Conv specific checks:
         for op_type in TFLiteSupportedOperators.transpose_convolution_ops:
             self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_tconv_stride)
@@ -534,12 +531,6 @@
             return valid, f"Tensor '{bias.name}' has values larger than 40-bits"
         return True, "Op has no bias tensor, or it fits in 40-bit"
 
-    def constraint_no_quantized_bias_type(op):
-        "Attribute quantized_bias_type must not be set"
-        quantized_bias_type = op.attrs.get("quantized_bias_type", False)
-        valid = quantized_bias_type == 0
-        return valid, f"Op has quantized_bias_type={quantized_bias_type}"
-
     @staticmethod
     def constraint_batch_size(op):
         "IFM Tensor batch size must be 1"