MLEMBED-1918: Issue with REDUCE_SUM on Ethos-U65-512

 - Ethos-U65-512 requires the input to REDUCE_SUM to use NHWC format
 - Updated the graph optimiser format check to cover this condition
 - Added an exception check to the backend of the compiler to verify that
this condition has not been violated by the external API or Vela internals

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I2f1fabcbd264daf77d5822349d855a3a32b12c64
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 57fd7db..5e7e112 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 from . import lut
+from .architecture_features import Accelerator
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
@@ -111,7 +112,10 @@
         return
 
     for op in tens.consumer_list:
-        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
+        if op.type == Op.ReduceSum and (
+            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
+        ):
+            # ReduceSum requires NHWC input
             return
         if op.type == Op.Reshape:
             # Using NHCWB16 format for a no-op reshape is only an option if subsequent
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index a8d1ddf..5680c96 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -931,6 +931,19 @@
 
 def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
     """Generates register commands for pooling operations"""
+    # check that reduce_sum input is NHWC
+    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
+        if npu_op.ifm.data_type == NpuDataType.INT32:
+            raise VelaError(
+                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
+                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
+            )
+        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
+            raise VelaError(
+                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
+                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
+            )
+
     use_global_scale = (
         npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
     )