[MLBEDSW-2846] Do not use NHCWB16 for reduce_sum int32

Added a check so that NHCWB16 is not used for int32 tensors consumed by
ReduceSum, which makes int8/uint8 softmax work.

Also enabled softmax graph rewrite by default and fixed a saturation
problem.

Change-Id: Ic01bd9ece7e5c3edb2900b7915cc747efe9e5760
Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 9b492f0..41902d6 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -24,6 +24,7 @@
 from . import live_range
 from . import npu_performance
 from . import stats_writer
+from .data_type import DataType
 from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list
 from .nn_graph import CascadedPass
 from .nn_graph import PassPlacement
@@ -963,7 +964,7 @@
                         use_NHCWB16 = True
                         rewrites = []
                         for op in output.consumer_list:
-                            if op is None:
+                            if op is None or (op.type == "ReduceSum" and output.dtype == DataType.int32):
                                 use_NHCWB16 = False
                             elif op.type == "Reshape":
                                 # Detect no-op reshapes by comparing their full input and output tensor shapes.