MLBEDSW-6148: Reduce SRAM usage for elementwise op

Reduce the memory footprint of elementwise operations when using the Size
optimization strategy: if the OFM depth is not a multiple of 16 and the OFM
can overwrite one of its inputs, keep the OFM in linear (NHWC) format instead
of NHCWB16 so that the output can be written on top of the input.
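
As a rough illustration (hypothetical int8 shapes, not taken from a real
network): for an elementwise op with IFM/OFM shape [1, 8, 8, 20], an NHCWB16
OFM is padded to a whole 16-channel brick and cannot overwrite the NHWC IFM,
whereas a linear OFM can:

    # Hypothetical int8 footprints, 1 byte per element
    def nhwc_bytes(n, h, w, c):
        return n * h * w * c

    def nhcwb16_bytes(n, h, w, c):
        padded_c = ((c + 15) // 16) * 16  # depth rounded up to a 16-channel brick
        return n * h * w * padded_c

    ifm = nhwc_bytes(1, 8, 8, 20)      # 1280 bytes (NHWC input)
    ofm = nhcwb16_bytes(1, 8, 8, 20)   # 2048 bytes (NHCWB16 output)
    # NHCWB16 OFM, no overlap possible: ifm + ofm = 3328 bytes
    # NHWC OFM overwriting the IFM:     1280 bytes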

Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I30380aed587c31adbf7615f74179b4c5da686773
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index d160777..8f2426c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -18,6 +18,7 @@
 # The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
 # subdivisions for the Operators
 import copy
+from collections import namedtuple
 from enum import auto
 from enum import IntEnum
 from typing import Dict
@@ -342,6 +343,45 @@
         self.max_schedule = None
         self.scheduler_options = options
 
+    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
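+        """Return True if the OFM of this pass should be kept in linear (NHWC)
+        format so that it can overwrite one of its inputs and save SRAM"""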
+        # Only run this check for opt strategy Size
+        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
+            return False
+
+        op = ps.primary_op
+        if not op.type.is_elementwise_op():
+            return False
+
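+        # NHCWB16 stores the depth in bricks of 16 channels; a depth that is
+        # already a multiple of 16 needs no padding, so the default format is kept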
+        depth = op.ofm_shapes[0][-1]
+        if (depth % 16) == 0:
+            return False
+
+        # Check if overwriting the inputs can be allowed
+        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
+        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
+        inps = []
+        if op.ifm is not None:
+            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
+        if op.ifm2 is not None:
+            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))
+
+        # Find an input tensor that can be overwritten by the output
+        for inp in inps:
+            if (
+                # check op input and output shapes allow overlapping
+                inp.op_shape == outp.op_shape
+                # check input tensor is valid
+                and inp.tens is not None
+                and inp.tens.shape != []
+                # check input and output tensors are compatible
+                and inp.tens.format == outp.tens.format
+                and inp.tens.dtype == outp.tens.dtype
+            ):
+                if inp.tens.format == TensorFormat.NHWC:
+                    return True
+
+        return False
+
     def create_scheduler_representation(self, arch: ArchitectureFeatures):
         """Creates a Scheduler Graph representation"""
         # Temporary dict for creating connections between the Operations
@@ -354,8 +394,15 @@
                 for output in ps.outputs:
                     if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                         continue
-                    if not output.needs_linear_format:
-                        output.set_format(TensorFormat.NHCWB16, arch)
+
+                    if output.needs_linear_format:
+                        continue
+
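+                    # Keep the OFM in linear format if that lets it overwrite
+                    # one of its inputs (see avoid_nhcwb16_for_ofm)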
+                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
+                        output.needs_linear_format = True
+                        continue
+
+                    output.set_format(TensorFormat.NHCWB16, arch)
 
                 # Create SchedulerOperations
                 op = SchedulerOperation(ps, arch, self.nng)