# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Serialises and packs an NPU subgraph into tensors.
Diego Russoea6111a2020-04-14 18:41:58 +010021import struct
22
23import numpy as np
24
25from . import driver_actions
Tim Hall79d07d22020-04-27 18:20:16 +010026from .nn_graph import PassPlacement
27from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat
28from .operation import Operation
29from .data_type import DataType
Tim Hall79d07d22020-04-27 18:20:16 +010030
31
def make_memory_tensor(name, mem_area, sz, want_values, arch):
    """Create a flat 1-D uint8 tensor of ``sz`` bytes in ``mem_area``.

    The tensor is marked as a FeatureMap in NHWC format.  When
    ``want_values`` is True it also gets a zero-filled numpy backing
    store so byte ranges can be written into it later.
    """
    memory_tensor = Tensor([sz], DataType.uint8, name)
    memory_tensor.purpose = TensorPurpose.FeatureMap
    memory_tensor.mem_area = mem_area
    memory_tensor.set_format(TensorFormat.NHWC, arch)
    if want_values:
        # Zero-initialised buffer; callers fill in the real payload afterwards.
        memory_tensor.values = np.zeros(memory_tensor.shape, np.uint8)
    return memory_tensor
40
41
def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
    """Copy the compressed value chunks of ``src_tensor`` into ``memory_tensor``.

    The chunks are written back-to-back into ``memory_tensor.values``,
    starting at the byte offset given by ``src_tensor.address``.
    """
    offset = src_tensor.address
    for chunk in src_tensor.compressed_values:
        memory_tensor.values[offset : offset + len(chunk)] = chunk
        offset += len(chunk)
48
49
def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
    """Serialise one NPU subgraph into scratch/flash/command-stream tensors.

    Packs the subgraph's register command stream (plus driver-action header)
    into a flash-resident command stream tensor, copies compressed weights and
    scales into the shared flash tensor, and sizes the shared SRAM scratch
    tensor.  The scratch and flash tensors are shared across all NPU subgraphs:
    the first NPU subgraph creates them, later ones grow them.

    Returns the (scratch, flash) tensor pair to thread into the next call;
    non-NPU subgraphs are passed through unchanged.
    """
    if sg.placement != PassPlacement.Npu:
        return scratch_tens, flash_tens

    flash_area = arch.permanent_storage_mem_area
    scratch_area = MemArea.Sram

    # Bytes this subgraph needs in each memory area (0 if unused).
    flash_size = sg.memory_used.get(flash_area, 0)
    scratch_size = sg.memory_used.get(scratch_area, 0)

    # Prepare driver actions for this command tensor
    da_list = []
    driver_actions.emit_fourcc(da_list, "COP1")
    driver_actions.emit_config(da_list, 0, 1, arch)
    driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))

    # Append command stream words
    da_list.extend(sg.register_command_stream)

    # Convert to bytes: little-endian stream of 32-bit words.
    payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)

    command_stream_size_bytes = len(payload_bytes)

    # Adjust the bits per element calculation to exclude metadata generated by Vela.
    # Sizes and element counts are interchangeable here because the memory
    # tensors are uint8 (one element per byte).
    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size

    # Chained comparison: true only when BOTH tensors are None.
    if flash_tens == scratch_tens is None:
        # First Npu subgraph, create scratch and flash tensors
        sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
        sg.scratch_tensor.purpose = TensorPurpose.Scratch
        sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
    else:
        # Subsequent Npu subgraphs reuse the shared tensors, growing their
        # 1-D shape by this subgraph's requirement.
        sg.scratch_tensor = scratch_tens
        sg.scratch_tensor.shape[0] += scratch_size
        sg.flash_tensor = flash_tens
        sg.flash_tensor.shape[0] += flash_size

    for cps in sg.cascaded_passes:
        for ps in cps.passes:
            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
                # is pointing at the destination address of where the weights should be placed in SRAM.
                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
                if ps.weight_tensor.ops[0].type == "DMA":
                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
                else:
                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)

                # NOTE(review): assumes scale_tensor is always present whenever
                # weight_tensor is — confirm; a None here would raise in the copy.
                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)

    # The serialised command stream lives in flash alongside the weights.
    sg.command_stream_tensor = make_memory_tensor(
        sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
    )
    sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)

    return sg.scratch_tensor, sg.flash_tensor
110
111
def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
    """Attach ``tens`` as a constant produced by the startup cascaded pass.

    Wraps the tensor in a new Const operation and prepends both the op and
    the tensor to the first pass of ``startup_cps`` (and to the cascaded
    pass's own output list).
    """
    const_op = Operation("Const", tens.name + "_const")
    const_op.outputs = [tens]
    tens.ops = [const_op]
    first_pass = startup_cps.passes[0]
    first_pass.ops.insert(0, const_op)
    first_pass.outputs.insert(0, tens)
    startup_cps.outputs.insert(0, tens)
119
120
def rewrite_npu_call_ops(nng, sg, arch):
    """Hook each NpuOp in a CPU subgraph up to its callee's serialised tensors.

    For every NpuOp the callee subgraph's scratch, flash and command-stream
    tensors are prepended to the op's, pass's and cascaded pass's inputs,
    the constant (non-scratch) tensors are emitted from the startup pass,
    and SRAM usage accounting is updated.  Only CPU subgraphs are processed.
    """
    if sg.placement != PassPlacement.Cpu:
        return

    # Constants must be produced by the very first cascaded pass.
    startup_cps = sg.cascaded_passes[0]

    for idx, cps in enumerate(sg.cascaded_passes):
        for ps in cps.passes:
            for op in ps.ops:
                if op.type == "NpuOp":
                    callee = op.attrs["subgraph"]
                    op.attrs["custom_options"] = {"type": op.type}

                    # Total storage of all three tensors (scratch included).
                    sz = 0
                    for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
                        op.inputs.insert(0, tens)
                        ps.inputs.insert(0, tens)
                        cps.inputs.insert(0, tens)
                        if tens != callee.scratch_tensor:
                            # Flash and command-stream tensors are constants;
                            # scratch is not, so it gets no Const producer.
                            add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
                        sz += tens.storage_size()

                    # These tensors are live from the start of the graph, so
                    # every cascaded pass up to and including this one must
                    # account for their SRAM footprint.
                    for prev_cps in sg.cascaded_passes[: idx + 1]:
                        prev_cps.sram_used += sz

                    # The callee's scratch also occupies SRAM while it runs.
                    if callee.scratch_tensor is not None:
                        cps.sram_used += callee.scratch_tensor.storage_size()