blob: bd13a3eceecefa0c43fb28a4bbd687230d4a7a4a [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Serialises and packs an NPU subgraph into tensors.
Diego Russoea6111a2020-04-14 18:41:58 +010018import struct
19
20import numpy as np
21
22from . import driver_actions
Tim Hall79d07d22020-04-27 18:20:16 +010023from .data_type import DataType
Diego Russoe8a10452020-04-21 17:39:10 +010024from .nn_graph import PassPlacement
25from .operation import Operation
26from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020027from .tensor import MemType
Diego Russoe8a10452020-04-21 17:39:10 +010028from .tensor import Tensor
29from .tensor import TensorFormat
30from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010031
32
def make_memory_tensor(name, mem_area, mem_type, sz, want_values, arch):
    """Build a one-dimensional uint8 tensor that stands for a block of memory.

    Args:
        name: Name given to the new tensor.
        mem_area: Value stored in the tensor's ``mem_area`` attribute.
        mem_type: Value stored in the tensor's ``mem_type`` attribute.
        sz: Number of uint8 elements; the tensor shape is ``[sz]``.
        want_values: When truthy, ``values`` is set to a zero-filled uint8
            array with the tensor's shape; otherwise ``values`` is not
            assigned here.
        arch: Passed through to ``Tensor.set_format``.

    Returns:
        The newly created tensor, with purpose ``FeatureMap`` and format
        ``NHWC``.
    """
    mem_tens = Tensor([sz], DataType.uint8, name)
    mem_tens.mem_area = mem_area
    mem_tens.mem_type = mem_type
    mem_tens.purpose = TensorPurpose.FeatureMap
    mem_tens.set_format(TensorFormat.NHWC, arch)

    if not want_values:
        return mem_tens

    # Backing storage starts zeroed; callers copy data into it afterwards.
    mem_tens.values = np.zeros(mem_tens.shape, np.uint8)
    return mem_tens
42
43
def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
    """Write a tensor's compressed value chunks back-to-back into a memory tensor.

    The first chunk is written at ``src_tensor.address``; each subsequent
    chunk starts where the previous one ended.

    Args:
        memory_tensor: Object whose ``values`` buffer receives the data.
        src_tensor: Object providing ``address`` (start offset) and
            ``compressed_values`` (an iterable of sized chunks).
    """
    cursor = src_tensor.address
    for chunk in src_tensor.compressed_values:
        chunk_end = cursor + len(chunk)
        memory_tensor.values[cursor:chunk_end] = chunk
        # The next chunk is packed directly after this one.
        cursor = chunk_end
50
Tim Hallc30f4952020-06-15 20:47:35 +010051
def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
    """Copy a tensor's quantised values into a memory tensor at its address.

    The elements of ``src_tensor.quant_values`` are written in row-major
    order into ``memory_tensor.values``, starting at ``src_tensor.address``
    and occupying one destination element per source element.

    Args:
        memory_tensor: Object whose ``values`` buffer receives the data; it
            is indexed here as a flat buffer.
        src_tensor: Object providing ``address`` (start offset) and
            ``quant_values`` (a numpy array of any dimensionality).
    """
    # Flatten first: assigning a multi-dimensional array to a 1-D slice
    # raises a broadcast error even when the element counts match.
    flat_values = src_tensor.quant_values.ravel()
    start_addr = src_tensor.address
    end_addr = start_addr + flat_values.size
    memory_tensor.values[start_addr:end_addr] = flat_values
Tim Hall79d07d22020-04-27 18:20:16 +010056
Tim Hallc30f4952020-06-15 20:47:35 +010057
def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
    """Pack an NPU subgraph's command stream and constant data into tensors.

    Subgraphs whose placement is not ``PassPlacement.Npu`` are left untouched
    and the incoming tensors are returned unchanged. Otherwise the function:

    * builds the driver-action payload (fourcc, config, command stream header
      followed by the register command stream) and stores it as bytes in
      ``sg.command_stream_tensor``;
    * subtracts this subgraph's flash, command stream and scratch sizes from
      ``nng.total_size`` and ``nng.total_elements``;
    * creates ``sg.scratch_tensor`` and ``sg.flash_tensor`` when none were
      passed in, or reuses the given tensors and grows their first dimension;
    * copies weights, scales and non-scratch input feature maps of every NPU
      pass into the flash tensor.

    Args:
        nng: Graph object whose ``total_size`` / ``total_elements`` mappings
            are adjusted.
        sg: Subgraph to serialise.
        arch: Architecture object providing the memory areas; also forwarded
            to the driver-action and tensor helpers.
        scratch_tens: Existing scratch tensor, or None.
        flash_tens: Existing flash tensor, or None.

    Returns:
        Tuple ``(scratch_tensor, flash_tensor)`` to pass on to the next call.
    """
    if sg.placement != PassPlacement.Npu:
        return scratch_tens, flash_tens

    flash_area = arch.permanent_storage_mem_area
    scratch_area = arch.feature_map_storage_mem_area

    flash_size = sg.memory_used.get(flash_area, 0)
    scratch_size = sg.memory_used.get(scratch_area, 0)

    # Prepare driver actions for this command tensor
    da_list = []
    driver_actions.emit_fourcc(da_list, "COP1")
    driver_actions.emit_config(da_list, 0, 1, arch)
    driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))

    # Append command stream words
    da_list.extend(sg.register_command_stream)

    # Convert to bytes: each word is packed as a little-endian unsigned 32-bit int
    payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)
    command_stream_size_bytes = len(payload_bytes)

    # Adjust the bits per element calculation to exclude metadata generated by Vela
    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size

    # Explicit None checks instead of the chained "a == b is None" form,
    # which hid the intent behind an equality comparison.
    if scratch_tens is None and flash_tens is None:
        # First Npu subgraph, create scratch and flash tensors
        sg.scratch_tensor = make_memory_tensor(
            sg.name + "_scratch", scratch_area, MemType.Scratch, scratch_size, False, arch
        )
        sg.scratch_tensor.purpose = TensorPurpose.Scratch
        sg.flash_tensor = make_memory_tensor(
            sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
        )
    else:
        # Later Npu subgraphs share the tensors and extend them.
        # NOTE(review): only shape[0] is grown here; `values` is not resized
        # in this function - confirm that this is handled elsewhere.
        sg.scratch_tensor = scratch_tens
        sg.scratch_tensor.shape[0] += scratch_size
        sg.flash_tensor = flash_tens
        sg.flash_tensor.shape[0] += flash_size

    for cps in sg.cascaded_passes:
        for ps in cps.passes:
            if ps.placement != PassPlacement.Npu:
                continue

            if ps.weight_tensor is not None:
                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
                # is pointing at the destination address of where the weights should be placed in SRAM.
                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
                weight_src = ps.weight_tensor
                if weight_src.ops[0].type == "DMA":
                    weight_src = weight_src.ops[0].inputs[0]
                copy_compressed_values_to_memory_tensor(sg.flash_tensor, weight_src)

                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)

            # Input feature maps that do not live in scratch memory are
            # stored in the flash tensor as well.
            if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
            if ps.ifm2_tensor is not None and (
                ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
            ):
                copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)

    sg.command_stream_tensor = make_memory_tensor(
        sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch
    )
    sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)

    return sg.scratch_tensor, sg.flash_tensor
130
131
def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
    """Register a tensor as the output of a new ``Const`` op in the startup pass.

    A ``Const`` operation named ``<tens.name>_const`` is created with ``tens``
    as its only output, and ``tens.ops`` is replaced by a list holding just
    that operation. The operation and the tensor are then inserted at the
    front of the first pass's ``ops`` / ``outputs`` lists and at the front of
    the cascaded pass's ``outputs`` list.

    Args:
        startup_cps: Cascaded pass whose first pass receives the new op.
        tens: Tensor to expose as a constant.
    """
    const_op = Operation("Const", tens.name + "_const")
    const_op.outputs = [tens]
    tens.ops = [const_op]

    first_pass = startup_cps.passes[0]
    first_pass.ops.insert(0, const_op)
    first_pass.outputs.insert(0, tens)
    startup_cps.outputs.insert(0, tens)
139
140
def rewrite_npu_call_ops(nng, sg, arch):
    """Attach the callee's memory tensors as inputs of every ``NpuOp`` in a CPU subgraph.

    Only subgraphs placed on ``PassPlacement.Cpu`` are processed. For each op
    of type ``"NpuOp"`` the function:

    * records the op type under ``op.attrs["custom_type"]``;
    * prepends the callee subgraph's scratch, flash and command stream
      tensors to the inputs of the op, its pass and its cascaded pass;
    * registers the flash and command stream tensors as constants in the
      first cascaded pass;
    * adds the combined storage size of the three tensors to ``sram_used``
      of every cascaded pass up to and including the current one;
    * adds the scratch tensor's storage size once more to the current
      cascaded pass when that tensor's memory area is ``MemArea.Sram``.

    Args:
        nng: Not referenced in this function.
        sg: Subgraph to rewrite in place.
        arch: Not referenced in this function.
    """
    if sg.placement != PassPlacement.Cpu:
        return

    startup_cps = sg.cascaded_passes[0]

    for idx, cps in enumerate(sg.cascaded_passes):
        for ps in cps.passes:
            for op in ps.ops:
                if op.type != "NpuOp":
                    continue

                callee = op.attrs["subgraph"]
                op.attrs["custom_type"] = op.type

                added_size = 0
                for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
                    op.inputs.insert(0, tens)
                    ps.inputs.insert(0, tens)
                    cps.inputs.insert(0, tens)
                    # The scratch tensor is the only one not registered as a constant.
                    if tens != callee.scratch_tensor:
                        add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
                    added_size += tens.storage_size()

                for prev_cps in sg.cascaded_passes[: idx + 1]:
                    prev_cps.sram_used += added_size

                if callee.scratch_tensor is not None and callee.scratch_tensor.mem_area == MemArea.Sram:
                    cps.sram_used += callee.scratch_tensor.storage_size()