MLBEDSW-6909: Use int32 acc for the Mean op

Changed acc type from int16 to int32. This will solve saturation problems and the constraint added in commit "MLBEDSW-5029: Output diff for Mean op" can be removed.

Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I05ec8835b43313b1a264d61a2b147fa62da123fe

commit: 059166304f9ef47f0b916c1325700ed826f25581 [log] [tgz]
author: Johan Alfvén <johan.alfven@arm.com> Tue Sep 06 20:33:22 2022 +0200
committer: Fredrik Svedberg <fredrik.svedberg@arm.com> Mon Sep 12 08:02:11 2022 +0000
tree: 49200e8bc2014e2cdbae52ba49a9e9b5b2ed776f
parent: d6efcd3cdcd8295fbbe7dd47a7074be39eaf03e4 [diff] [blame]
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index aaa778e..0f199de 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py

@@ -1476,9 +1476,10 @@
                 # followed by a multiplication with 1/N to get the MEAN
                 weight_scale = 1
                 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
-                intermediate.dtype = DataType.int16
+                intermediate.dtype = DataType.int32
                 mul_op = Operation(Op.Mul, op.name + "_mul")
                 mul_op.add_input_tensor(intermediate)
+                mul_op.set_output_tensor(op.ofm)
                 # Create scalar containing 1/N
                 quant = QuantizationParameters()
                 quant.zero_point = 0
@@ -1492,11 +1493,23 @@
                 n = int(h * w)
                 eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
                 quant.scale_f32 = 1 / (n - eps)
+
+                # For int8/int16 we could use IFM/OFM scaling to do the division
+                # intermediate * 1 -> scale -> round and shift.
+                #
+                # For int32 scaling is not supported so instead multiply with the scale
+                # intermediate * scale -> round and shift.
+                #
+                # Calculate the scale and shift values. The const tensor must be created
+                # with the correct quantization since the scale and shift are calculated
+                # later in the command stream generator.
+                mul_scale, _ = scaling.elementwise_mul_scale(
+                    mul_op.ifm.quantization.scale_f32, quant.scale_f32, mul_op.ofm.quantization.scale_f32
+                )
                 scalar = create_const_tensor(
-                    op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
+                    op.name + "_scalar", [1, 1, 1, 1], DataType.int32, [mul_scale], np.int32, quantization=quant
                 )
                 mul_op.add_input_tensor(scalar)
-                mul_op.set_output_tensor(op.ofm)
                 mul_op.set_ifm_ofm_shapes()
                 mul_op.rounding_mode = NpuRoundingMode.NATURAL
                 mul_op.activation = op.activation
commit	059166304f9ef47f0b916c1325700ed826f25581	[log] [tgz]
author	Johan Alfvén <johan.alfven@arm.com>	Tue Sep 06 20:33:22 2022 +0200
committer	Fredrik Svedberg <fredrik.svedberg@arm.com>	Mon Sep 12 08:02:11 2022 +0000
tree	49200e8bc2014e2cdbae52ba49a9e9b5b2ed776f
parent	d6efcd3cdcd8295fbbe7dd47a7074be39eaf03e4 [diff] [blame]