MLBEDSW-2688: Improved LUT support

- Support for more than one 256-byte LUT in SHRAM
- No DMA is performed for a LUT that is already located in SHRAM
- Added MemArea.Shram, used for LUT, to avoid false address collision
  asserts during SRAM tensor allocation
- Added read access to LUT in memory access calculation

Change-Id: If4d1eded5ed029d253f4f5efb2d80495fc3eac99
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/test/test_live_range.py b/ethosu/vela/test/test_live_range.py
index 395d0f3..d087dd9 100644
--- a/ethosu/vela/test/test_live_range.py
+++ b/ethosu/vela/test/test_live_range.py
@@ -18,6 +18,7 @@
 from unittest.mock import MagicMock
 
 import pytest
+
 from ethosu.vela.live_range import LiveRange
 
 
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
new file mode 100644
index 0000000..3b7f57b
--- /dev/null
+++ b/ethosu/vela/test/test_lut.py
@@ -0,0 +1,180 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Unit tests for LUT support
+import numpy as np
+
+from ethosu.vela import insert_dma
+from ethosu.vela import lut
+from ethosu.vela import mark_tensors
+from ethosu.vela import pass_packing
+from ethosu.vela.data_type import DataType
+from ethosu.vela.high_level_command_stream import DMA
+from ethosu.vela.nn_graph import Graph
+from ethosu.vela.rewrite_graph import verify_graph_health
+from ethosu.vela.tensor import create_const_tensor
+from ethosu.vela.tensor import TensorPurpose
+from ethosu.vela.test import testutil
+
+
+def set_256_lut(op, key):
+    values = list(range(256))
+    lut_tensor = create_const_tensor(
+        op.name + "_lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT
+    )
+    lut_tensor.equivalence_id = lut.create_equivalence_id(key)
+    op.set_activation_lut(lut_tensor)
+
+
+def set_1K_lut(op, key):
+    values = list(range(256))
+    lut_tensor = create_const_tensor(
+        op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, np.uint32, TensorPurpose.LUT
+    )
+    lut_tensor.equivalence_id = lut.create_equivalence_id(key)
+    op.set_activation_lut(lut_tensor)
+
+
+def set_2K_lut(op, key):
+    values = list(range(512))
+    lut_tensor = create_const_tensor(
+        op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, np.uint32, TensorPurpose.LUT
+    )
+    lut_tensor.equivalence_id = lut.create_equivalence_id(key)
+    op.set_activation_lut(lut_tensor)
+
+
+def process(arch, op_list):
+    # Returns subgraph with given operations
+    nng = Graph()
+    sg = testutil.create_subgraph(op_list)
+    nng.subgraphs.append(sg)
+    assert verify_graph_health(nng)
+    nng = mark_tensors.mark_tensor_purpose(nng, arch, False)
+    assert verify_graph_health(nng)
+    nng = insert_dma.insert_dma_commands(nng, arch, False)
+    assert verify_graph_health(nng)
+    pass_packing.pack_into_passes(nng, arch, False)
+    assert verify_graph_health(nng)
+    # Create a DMA instruction for every op
+    cmd_list = []
+    for ps in sg.passes:
+        for intermediate in ps.intermediates:
+            if intermediate.needs_dma():
+                cmd_list.append(DMA(ps, intermediate.get_dma_src_tensor(), intermediate, None))
+    sg.high_level_command_stream = cmd_list
+    return sg
+
+
+def test_optimize_high_level_cmd_stream_2K():
+    # Tests lut.optimize_high_level_cmd_stream, blending 256 byte and 2K luts
+    arch = testutil.create_arch()
+    shape = [1, 1, 1, 1]
+    # u8 LUT op, should lead to DMA
+    op0 = testutil.create_elemwise_op("AddAct", "op0", shape, shape, shape)
+    set_256_lut(op0, "lut0")
+    # u8 LUT op, should lead to DMA
+    op1 = testutil.create_elemwise_op("AddAct", "op1", shape, shape, shape)
+    set_256_lut(op1, "lut1")
+    # u8 LUT op with different LUT, should lead to DMA
+    op2 = testutil.create_elemwise_op("AddAct", "op2", shape, shape, shape)
+    set_256_lut(op2, "lut2")
+    # u8 LUT op with same LUT as in op1, should not lead to DMA
+    op3 = testutil.create_elemwise_op("AddAct", "op3", shape, shape, shape)
+    set_256_lut(op3, "lut1")
+    # u8 LUT op with same LUT as in op2, should not lead to DMA
+    op4 = testutil.create_elemwise_op("AddAct", "op4", shape, shape, shape)
+    set_256_lut(op4, "lut2")
+    # 2K LUT op, should lead to DMA, and will overwrite all previous LUTs in SHRAM
+    op5_2K = testutil.create_elemwise_op("AddAct", "op5", shape, shape, shape)
+    set_2K_lut(op5_2K, "lut5")
+    # Another 2K LUT op, should lead to DMA, and will overwrite the previous LUT in SHRAM
+    op6_2K = testutil.create_elemwise_op("AddAct", "op6", shape, shape, shape)
+    set_2K_lut(op6_2K, "lut6")
+    # u8 LUT op with same LUT as in op1, should lead to DMA
+    op7 = testutil.create_elemwise_op("AddAct", "op7", shape, shape, shape)
+    set_256_lut(op7, "lut1")
+
+    op_list = [op0, op1, op2, op3, op4, op5_2K, op6_2K, op7]
+    sg = process(arch, op_list)
+    orig_cmd_list = sg.high_level_command_stream
+    sg.high_level_command_stream = orig_cmd_list
+    lut.optimize_high_level_cmd_stream(sg, arch)
+    cmd_list = sg.high_level_command_stream
+    # Check that only the needed DMA commands are left
+    expected_dma_ops = [op0, op1, op2, op5_2K, op6_2K, op7]
+    for (cmd, op) in zip(cmd_list, expected_dma_ops):
+        assert cmd.in_tensor == op.activation_lut
+    # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
+    assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
+    assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
+    assert orig_cmd_list[1].out_tensor.address != orig_cmd_list[2].out_tensor.address
+    # Check that lut1 in op1 and op3 have same address
+    assert orig_cmd_list[1].out_tensor.address == orig_cmd_list[3].out_tensor.address
+    # Check that lut2 in op2 and op4 have same address
+    assert orig_cmd_list[2].out_tensor.address == orig_cmd_list[4].out_tensor.address
+    # Check that lut-s for 16 bit (op5 and op6) are stored on same address
+    assert orig_cmd_list[5].out_tensor.address == orig_cmd_list[6].out_tensor.address
+
+
+def test_optimize_high_level_cmd_stream_1K():
+    # Tests lut.optimize_high_level_cmd_stream, blending 256 and 1K luts
+    arch = testutil.create_arch()
+    shape = [1, 1, 1, 1]
+    # u8 LUT op, should lead to DMA
+    op0 = testutil.create_elemwise_op("AddAct", "op0", shape, shape, shape)
+    set_256_lut(op0, "lut0")
+    # u8 LUT op, should lead to DMA
+    op1 = testutil.create_elemwise_op("AddAct", "op1", shape, shape, shape)
+    set_256_lut(op1, "lut1")
+    # 1K LUT op with different LUT, should lead to DMA
+    op2_1K = testutil.create_elemwise_op("AddAct", "op2", shape, shape, shape)
+    set_1K_lut(op2_1K, "lut2")
+    # u8 LUT op with same LUT as in op1, should not lead to DMA
+    op3 = testutil.create_elemwise_op("AddAct", "op3", shape, shape, shape)
+    set_256_lut(op3, "lut1")
+    # 1K LUT op with same LUT as in op2, should not lead to DMA
+    op4_1K = testutil.create_elemwise_op("AddAct", "op4", shape, shape, shape)
+    set_1K_lut(op4_1K, "lut2")
+    # 1K LUT op, should lead to DMA, and will overwrite lut2
+    op5_2K = testutil.create_elemwise_op("AddAct", "op5", shape, shape, shape)
+    set_1K_lut(op5_2K, "lut5")
+    # u8 LUT op, lut0 should still be present, should not lead to DMA
+    op6 = testutil.create_elemwise_op("AddAct", "op6", shape, shape, shape)
+    set_256_lut(op6, "lut0")
+    # 1K LUT op with same LUT as in op2, should lead to DMA
+    op7 = testutil.create_elemwise_op("AddAct", "op7", shape, shape, shape)
+    set_1K_lut(op7, "lut2")
+
+    op_list = [op0, op1, op2_1K, op3, op4_1K, op5_2K, op6, op7]
+    sg = process(arch, op_list)
+    orig_cmd_list = sg.high_level_command_stream
+    sg.high_level_command_stream = orig_cmd_list
+    lut.optimize_high_level_cmd_stream(sg, arch)
+    cmd_list = sg.high_level_command_stream
+    # Check that only the needed DMA commands are left
+    expected_dma_ops = [op0, op1, op2_1K, op5_2K, op7]
+    for (cmd, op) in zip(cmd_list, expected_dma_ops):
+        assert cmd.in_tensor == op.activation_lut
+    # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
+    assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
+    assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
+    assert orig_cmd_list[1].out_tensor.address != orig_cmd_list[2].out_tensor.address
+    # Check that lut1 in op1 and op3 have same address
+    assert orig_cmd_list[1].out_tensor.address == orig_cmd_list[3].out_tensor.address
+    # Check that lut2 in op2 and op4 and op7 have same address
+    assert orig_cmd_list[2].out_tensor.address == orig_cmd_list[4].out_tensor.address
+    assert orig_cmd_list[2].out_tensor.address == orig_cmd_list[7].out_tensor.address
diff --git a/ethosu/vela/test/test_model_reader.py b/ethosu/vela/test/test_model_reader.py
index 23e7e90..bd7ca37 100644
--- a/ethosu/vela/test/test_model_reader.py
+++ b/ethosu/vela/test/test_model_reader.py
@@ -16,6 +16,7 @@
 # Description:
 # Unit tests for model_reader.
 import pytest
+
 from ethosu.vela import model_reader
 from ethosu.vela.errors import InputFileError
 
diff --git a/ethosu/vela/test/test_tflite_reader.py b/ethosu/vela/test/test_tflite_reader.py
index 898e384..1ba0742 100644
--- a/ethosu/vela/test/test_tflite_reader.py
+++ b/ethosu/vela/test/test_tflite_reader.py
@@ -16,6 +16,7 @@
 # Description:
 # Contains unit tests for tflite_reader
 import pytest
+
 from ethosu.vela.tflite_reader import TFLiteSubgraph
 
 
diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
new file mode 100644
index 0000000..116afa4
--- /dev/null
+++ b/ethosu/vela/test/testutil.py
@@ -0,0 +1,70 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Utilities used in vela unit tests
+import numpy as np
+
+from ethosu.vela import architecture_features
+from ethosu.vela.data_type import DataType
+from ethosu.vela.nn_graph import Subgraph
+from ethosu.vela.operation import NpuBlockType
+from ethosu.vela.operation import Operation
+from ethosu.vela.tensor import create_const_tensor
+from ethosu.vela.tensor import MemArea
+from ethosu.vela.tensor import Tensor
+
+
+def create_arch():
+    # Returns a default test architecture: Ethos-U55 with 128 MACs,
+    # on-chip flash as permanent storage, and no block-config overrides.
+    return architecture_features.ArchitectureFeatures(
+        vela_config=None,
+        system_config=None,
+        accelerator_config=architecture_features.Accelerator.Ethos_U55_128.value,
+        permanent_storage=MemArea.OnChipFlash,
+        override_block_config=None,
+        block_config_limit=None,
+        global_memory_clock_scale=1.0,
+        max_blockdep=0,
+        softmax_support=True,
+    )
+
+
+def create_elemwise_op(type, name, ifm_shape, ifm2_shape, ofm_shape, datatype=DataType.uint8):
+    # Creates elementwise operation with constant IFM/IFM2
+    if datatype.size_in_bytes() == 1:
+        np_type = np.uint8
+    elif datatype.size_in_bytes() == 2:
+        np_type = np.int16
+    else:
+        np_type = np.int32
+    op = Operation(type, name)
+    op.add_input_tensor(create_const_tensor(name + "_ifm", ifm_shape, datatype, np.zeros(ifm_shape), np_type))
+    op.add_input_tensor(create_const_tensor(name + "_ifm2", ifm2_shape, datatype, np.zeros(ifm2_shape), np_type))
+    ofm = Tensor(ofm_shape, datatype, name + "_ofm")
+    op.set_output_tensor(ofm)
+    op.attrs["npu_block_type"] = NpuBlockType.ElementWise
+    return op
+
+
+def create_subgraph(op_list):
+    # Creates subgraph using the given list of operations
+    sg = Subgraph()
+    all_inputs = set(tens for op in op_list for tens in op.inputs)
+    # Reversing, so that the resulting subgraph has same order as op_list
+    for op in op_list[::-1]:
+        for tens in op.outputs:
+            if tens not in all_inputs and tens not in sg.output_tensors:
+                sg.output_tensors.append(tens)
+    return sg