blob: 7989fa901ab791712afe67ddd4b7ece6e47cf0d6 [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Serialises and packs an NPU subgraph into tensors.
Diego Russoea6111a2020-04-14 18:41:58 +010018import struct
19
20import numpy as np
21
22from . import driver_actions
Tim Hall79d07d22020-04-27 18:20:16 +010023from .data_type import DataType
Diego Russoe8a10452020-04-21 17:39:10 +010024from .nn_graph import PassPlacement
Louis Verhaardaee5d752020-09-30 09:01:52 +020025from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010026from .operation import Operation
27from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020028from .tensor import MemType
Diego Russoe8a10452020-04-21 17:39:10 +010029from .tensor import Tensor
30from .tensor import TensorFormat
31from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010032
33
def make_memory_tensor(name, mem_area, mem_type, sz, want_values, arch):
    """Build a one-dimensional uint8 tensor of ``sz`` elements describing a memory region.

    The tensor is tagged with ``mem_area`` and ``mem_type``, given the FeatureMap
    purpose and set to NHWC format for ``arch``. When ``want_values`` is true its
    ``values`` attribute is set to a zero-filled uint8 array of the tensor's shape;
    otherwise ``values`` is not assigned here.

    Returns the newly created tensor.
    """
    region = Tensor([sz], DataType.uint8, name)
    region.mem_area = mem_area
    region.mem_type = mem_type
    region.purpose = TensorPurpose.FeatureMap
    region.set_format(TensorFormat.NHWC, arch)

    if not want_values:
        return region

    # Backing storage that later copy helpers can write into
    region.values = np.zeros(region.shape, dtype=np.uint8)
    return region
43
44
def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
    """Write each entry of ``src_tensor.compressed_values`` into ``memory_tensor.values``.

    The first entry is written at offset ``src_tensor.address``; every following
    entry is written immediately after the previous one, so the entries end up
    back to back in the destination.
    """
    offset = src_tensor.address
    for chunk in src_tensor.compressed_values:
        chunk_len = len(chunk)
        memory_tensor.values[offset : offset + chunk_len] = chunk
        # The next chunk starts where this one ended
        offset += chunk_len
51
Tim Hallc30f4952020-06-15 20:47:35 +010052
def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
    """Copy a tensor's constant data into ``memory_tensor.values`` at ``src_tensor.address``.

    ``src_tensor.quant_values`` is used when it is not None, otherwise
    ``src_tensor.values``. The data is flattened first; when the tensor's dtype
    reports more than one byte per element, the flattened array is reinterpreted
    as its raw in-memory bytes before being written.
    """
    if src_tensor.quant_values is not None:
        payload = src_tensor.quant_values.flatten()
    else:
        payload = src_tensor.values.flatten()

    if src_tensor.dtype.size_in_bytes() > 1:
        # Destination is a byte array, so view multi-byte elements as individual bytes
        payload = np.frombuffer(payload.tobytes(), dtype=np.uint8)

    first = src_tensor.address
    memory_tensor.values[first : first + payload.size] = payload
Tim Hall79d07d22020-04-27 18:20:16 +010060
Tim Hallc30f4952020-06-15 20:47:35 +010061
def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens):
    """Pack an NPU subgraph's command stream and constant data into memory tensors.

    Subgraphs whose placement is not ``PassPlacement.Npu`` are left untouched and
    the three tensors passed in are returned unchanged.

    For an NPU subgraph this function:
      * builds the driver payload (fourcc, config, command stream header followed by
        ``sg.register_command_stream``) and packs it as little-endian 32-bit words,
      * subtracts the subgraph's flash/scratch usage and the payload size from the
        ``nng.total_size`` / ``nng.total_elements`` counters,
      * creates the scratch, scratch-fast and flash tensors when neither
        ``flash_tens`` nor ``scratch_tens`` has been created yet, otherwise reuses
        the ones passed in and grows the scratch and flash tensors,
      * copies weights, scales, LUTs and non-scratch input feature maps of every
        NPU pass into the flash tensor,
      * stores the payload in ``sg.command_stream_tensor``.

    Args:
        nng: graph object holding the ``total_size`` and ``total_elements`` mappings.
        sg: subgraph to serialise; receives the ``scratch_tensor``, ``flash_tensor``,
            ``scratch_fast_tensor`` and ``command_stream_tensor`` attributes.
        arch: architecture description supplying the memory areas.
        scratch_tens: scratch tensor from a previous call, or None.
        scratch_fast_tens: scratch-fast tensor from a previous call, or None.
        flash_tens: flash tensor from a previous call, or None.

    Returns:
        Tuple ``(scratch_tensor, scratch_fast_tensor, flash_tensor)`` to be passed
        to the next call.
    """
    if sg.placement != PassPlacement.Npu:
        return scratch_tens, scratch_fast_tens, flash_tens

    flash_area = arch.permanent_storage_mem_area
    scratch_area = arch.feature_map_storage_mem_area
    scratch_fast_area = arch.fast_storage_mem_area

    flash_size = sg.memory_used.get(flash_area, 0)
    scratch_size = sg.memory_used.get(scratch_area, 0)

    # Prepare driver actions for this command tensor
    da_list = []
    driver_actions.emit_fourcc(da_list, "COP1")
    driver_actions.emit_config(da_list, 0, 1, arch)
    driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))

    # Append command stream words
    da_list.extend(sg.register_command_stream)

    # Convert to bytes
    payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)

    command_stream_size_bytes = len(payload_bytes)

    # Adjust the bits per element calculation to exclude metadata generated by Vela
    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size

    if scratch_area != scratch_fast_area:
        # Values are left unchanged; this only makes sure both mappings have an entry for the area
        nng.total_size[scratch_fast_area] = nng.total_size.get(scratch_fast_area, 0)
        nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0)

    # Explicit identity checks instead of the chained "flash_tens == scratch_tens is None",
    # which compared the tensor objects with == before testing for None
    if flash_tens is None and scratch_tens is None:
        # First Npu subgraph, create scratch and flash tensors
        sg.scratch_tensor = make_memory_tensor(
            sg.name + "_scratch", scratch_area, MemType.Scratch, scratch_size, False, arch
        )
        sg.scratch_tensor.purpose = TensorPurpose.Scratch
        sg.flash_tensor = make_memory_tensor(
            sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
        )
        # Scratch fast tensor size set to 0. This forces a minimal allocation in the tensor arena
        # which causes a slot in the basep registers to be reserved, so that the scratch fast tensor
        # address can be overridden.
        sg.scratch_fast_tensor = make_memory_tensor(
            sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch
        )
        sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch
    else:
        # Later Npu subgraph: share the existing tensors and grow them by this subgraph's usage
        sg.scratch_tensor = scratch_tens
        sg.scratch_tensor.shape[0] += scratch_size
        sg.flash_tensor = flash_tens
        sg.flash_tensor.shape[0] += flash_size

        sg.scratch_fast_tensor = scratch_fast_tens
        sg.scratch_fast_tensor.shape[0] = 0

    for cps in sg.cascaded_passes:
        for ps in cps.passes:
            if ps.placement == PassPlacement.Npu:
                if ps.weight_tensor is not None:
                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
                    # is pointing at the destination address of where the weights should be placed in SRAM.
                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
                    if ps.weight_tensor.ops[0].type == Op.DMA:
                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
                    else:
                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)

                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)

                if ps.lut_tensor is not None:
                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor)
                # Only inputs that do not live in scratch memory carry data to be copied into flash
                if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
                if ps.ifm2_tensor is not None and (
                    ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
                ):
                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)

    sg.command_stream_tensor = make_memory_tensor(
        sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch
    )
    # Replace the zero-filled placeholder with the packed payload bytes
    sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)

    return sg.scratch_tensor, sg.scratch_fast_tensor, sg.flash_tensor
Tim Hall79d07d22020-04-27 18:20:16 +0100151
152
def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
    """Register ``tens` as a constant produced by the first pass of ``startup_cps``.

    A new ``Op.Const`` operation named ``<tens.name>_const`` is created with ``tens``
    as its output tensor and placed at the front of the first pass's op list.
    ``tens`` is also placed at the front of the output lists of both that pass and
    the cascaded pass itself.
    """
    const_op = Operation(Op.Const, tens.name + "_const")
    const_op.set_output_tensor(tens)

    first_pass = startup_cps.passes[0]
    first_pass.ops.insert(0, const_op)
    first_pass.outputs.insert(0, tens)
    startup_cps.outputs.insert(0, tens)
159
160
def rewrite_npu_call_ops(nng, sg, arch):
    """Attach the callee's memory tensors to every custom NPU op of a CPU subgraph.

    Subgraphs whose placement is not ``PassPlacement.Cpu`` are ignored. For each
    ``Op.CustomNpuOp`` found, the callee subgraph is read from
    ``op.attrs["subgraph"]`` and its scratch-fast, scratch, flash and command stream
    tensors are inserted at the front of the inputs of the op, its pass and its
    cascaded pass. The flash and command stream tensors are additionally registered
    as constants in the first cascaded pass. The ``sram_used`` counters of the
    cascaded passes are then updated.

    ``nng`` and ``arch`` are not used by this function.
    """
    if sg.placement != PassPlacement.Cpu:
        return

    startup_cps = sg.cascaded_passes[0]

    for idx, cps in enumerate(sg.cascaded_passes):
        for ps in cps.passes:
            # Iterate over a snapshot: add_const_tens_to_startup_cascaded_pass() inserts new ops at the
            # front of startup_cps.passes[0].ops, which can be this very list. Iterating the live list
            # would then revisit the same NPU op over and over.
            for op in list(ps.ops):
                if op.type == Op.CustomNpuOp:
                    callee = op.attrs["subgraph"]

                    sz = 0
                    for tens in [
                        callee.scratch_fast_tensor,
                        callee.scratch_tensor,
                        callee.flash_tensor,
                        callee.command_stream_tensor,
                    ]:
                        # Each tensor is inserted at index 0, so the final input order is the reverse
                        # of this list (command stream first)
                        op.inputs.insert(0, tens)
                        ps.inputs.insert(0, tens)
                        cps.inputs.insert(0, tens)
                        if tens != callee.scratch_tensor and tens != callee.scratch_fast_tensor:
                            add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
                        sz += tens.storage_size()

                    # Every cascaded pass up to and including this one is charged for the tensors
                    for prev_cps in sg.cascaded_passes[: idx + 1]:
                        prev_cps.sram_used += sz

                    if callee.scratch_tensor is not None:
                        if callee.scratch_tensor.mem_area == MemArea.Sram:
                            cps.sram_used += callee.scratch_tensor.storage_size()

                    if callee.scratch_fast_tensor is not None:
                        if callee.scratch_fast_tensor.mem_area == MemArea.Sram:
                            cps.sram_used += callee.scratch_fast_tensor.storage_size()