Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left |
| 18 | # untouched in the final output. |
| 19 | # |
| 20 | # Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked |
| 21 | # by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and |
| 22 | # attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 23 | import numpy as np |
| 24 | |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 25 | from .nn_graph import Pass |
| 26 | from .nn_graph import PassPlacement |
| 27 | from .nn_graph import Subgraph |
| 28 | from .operation import NpuBlockType |
| 29 | from .operation import Operation |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 30 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 31 | |
def make_npu_call_op_pass(npu_subgraph):
    """Build a MemoryOnly pass wrapping a single NpuOp that invokes *npu_subgraph*.

    The returned pass lives in the CPU graph and acts as the call site for the
    extracted NPU subgraph; the subgraph itself is stashed in the op's attrs.
    """
    call_op = Operation("NpuOp", "call_" + npu_subgraph.name)
    call_op.attrs["subgraph"] = npu_subgraph

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op

    # Mirror the pass's block type onto the op and link op -> pass.
    call_op.attrs["npu_block_type"] = call_ps.npu_block_type
    call_op.scheduled_pass = call_ps

    # Inputs and outputs filled in later as we cut the graphs
    return call_ps
| 43 | |
| 44 | |
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Replace every reference to *orig_tens* with *new_tens* on *op*.

    The substitution covers the op's input/output lists and, when the op has a
    scheduled pass, that pass's input/output lists and its named tensor slots.
    """

    def substituted(tensors):
        # Preserve order; swap only entries equal to the original tensor.
        return [new_tens if t == orig_tens else t for t in tensors]

    op.inputs = substituted(op.inputs)
    op.outputs = substituted(op.outputs)

    ps = op.scheduled_pass
    if ps is None:
        return  # Op not yet assigned to a pass; nothing more to patch.

    ps.inputs = substituted(ps.inputs)
    ps.outputs = substituted(ps.outputs)

    # Named single-tensor slots on the pass.
    for slot in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(ps, slot) == orig_tens:
            setattr(ps, slot, new_tens)
| 67 | |
| 68 | |
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Cut a tensor produced on the CPU but consumed inside an NPU subgraph.

    A clone of the tensor is created on the NPU side, fed by either a Const op
    (constant data) or a SubgraphInput op in the NPU subgraph's startup-init
    pass; NPU-side consumers are re-pointed at the clone.
    """
    is_const = orig_tens.ops[0].type == "Const"

    # Clone onto the NPU side and cross-link the two tensors.
    new_tens = orig_tens.clone("_npu")
    orig_tens.npu_tensor = new_tens
    new_tens.cpu_tensor = orig_tens

    # Feed the clone from the startup-init pass of the NPU subgraph.
    feed_op = Operation("Const" if is_const else "SubgraphInput", orig_tens.name + "_input")
    feed_op.attrs["npu_block_type"] = NpuBlockType.Default
    feed_op.scheduled_pass = startup_init_ps
    feed_op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(feed_op)
    startup_init_ps.outputs.append(new_tens)

    if not is_const:
        # Non-constant data must be passed into the NpuOp call site.
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Re-point consumers that live in the NPU subgraph at the clone.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] == npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [
        new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors
    ]
| 103 | |
| 104 | |
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Cut a tensor produced inside an NPU subgraph but consumed on the CPU.

    A CPU-side clone is created and produced by the NpuOp call pass; consumers
    outside the NPU subgraph are re-pointed at the clone, and the original
    tensor becomes an output of the NPU subgraph.
    """
    # Clone onto the CPU side and cross-link the two tensors.
    new_tens = orig_tens.clone("_cpu")
    new_tens.npu_tensor = orig_tens
    orig_tens.cpu_tensor = new_tens

    # The original tensor must now leave the NPU subgraph.
    npu_subgraph.output_tensors.append(orig_tens)

    # The CPU clone is produced by the NpuOp call site.
    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops = [call_ps.primary_op]

    # Re-point consumers that live outside the NPU subgraph at the clone.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] != npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [
        new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors
    ]
| 130 | |
| 131 | |
def extract_subgraph(nng, orig_sg, arch):
    """Pull the NPU passes out of *orig_sg* (a CPU subgraph) into new NPU subgraphs.

    Each contiguous run of NPU-placed passes becomes its own Subgraph invoked
    from orig_sg by an NpuOp call pass; tensors crossing the CPU/NPU boundary
    are rewritten in both directions. nng and arch are part of the established
    call signature but unused here.

    Returns the list of newly created NPU subgraphs ([] if nothing to extract).
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    # Keep the startup init pass on the CPU; new ones are made per NPU subgraph below.
    place_vec[place_vec == PassPlacement.StartupInit] = PassPlacement.Cpu

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU.

    # Forward, then backwards
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}  # NPU subgraph -> its NpuOp call pass in orig_sg
    startup_init_passes = {}  # NPU subgraph -> its startup-init pass

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Start a new NPU subgraph with its call site and startup-init pass.
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Bug fix: the original asserted len(source_sgs) >= 0, which is
                # vacuously true. An input tensor must have at least one producer
                # for the indexing below to be valid.
                assert len(source_sgs) > 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:
                # Rewrite if any consumer lives outside this NPU subgraph, or the
                # tensor is an output of the original CPU graph.
                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = any(sg != curr_sg for sg in dest_sgs)
                if tens in orig_sg.output_tensors:
                    need_rewrite = True

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

    return new_subgraphs
| 238 | |
| 239 | |
def extract_npu_subgraphs(nng, arch):
    """Top-level driver: split NPU work out of every CPU subgraph in *nng*.

    Refreshes the graph before and after extraction, prunes startup-init
    passes, and rebuilds pass links on all subgraphs.
    """
    nng.refresh_after_modification()

    # Iterate over a snapshot, since extraction appends new subgraphs to nng.
    for sg in list(nng.subgraphs):
        if sg.placement != PassPlacement.Cpu:
            continue
        nng.subgraphs += extract_subgraph(nng, sg, arch)

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for sg in nng.subgraphs:
        sg.build_pass_links()