# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations, generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

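# Illustrative note (not part of the generator): RegisterMachine acts as a
# write-back cache over the register file, so re-emitting an unchanged value
# produces no command. Assuming the class above:
#
#   rm = RegisterMachine()
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # True: value changed, emit
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # False: redundant, skip
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 2)  # True: changed again, emit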

class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


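# Illustrative note on the command word layout used by the emitter below:
# bits [9:0] hold the cmd0/cmd1 opcode, bit 14 (Payload32) selects whether a
# second 32-bit payload word follows, and bits [31:16] carry the 16-bit
# parameter. A hypothetical decode of one word would look like:
#
#   payload_mode = CmdMode(word & CmdMode.Mask)  # NoPayload or Payload32
#   opcode = word & CmdMode.CmdOpMask            # register/operation id
#   param = (word >> 16) & 0xFFFF                # immediate parameter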
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "  0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF  # payload is a single 32-bit word
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)

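# Worked example (illustrative, assuming quantise_float32 computes
# zero_point + round_away_zero(value / scale)): with scale_f32=0.5 and
# zero_point=10, quantise(1.0, quant) -> 10 + round(1.0 / 0.5) = 12; with
# quant=None the value is simply rounded.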

def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


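# Worked example (illustrative) of the stride/dilation bit packing above, for
# a 3x3 kernel, stride 2x2, dilation 1x1, depth-first block traversal:
#   KERNEL_HEIGHT_M1 = 1 * (3 - 1) = 2, KERNEL_WIDTH_M1 = 2
#   stride x low bit = (2-1) & 1 = 1; stride y low bit -> 1 << 1 = 2
#   extension and dilation bits are all 0, part-kernel-first bit not set
#   => NPU_SET_KERNEL_STRIDE = 0b11 = 3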

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    op_type = npu_op.op_type
    block_type = NpuBlockType.Default
    if op_type == NpuOperationType.Conv2D:
        block_type = NpuBlockType.ConvolutionMxN
    elif op_type == NpuOperationType.ConvDepthWise:
        block_type = NpuBlockType.ConvolutionDepthWise
    elif op_type == NpuOperationType.Pooling:
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif op_type == NpuOperationType.ElementWise:
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if npu_op.op_type == NpuOperationType.ElementWise:
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

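# Illustrative note: NPU_SET_OFM_SCALE carries a (scale, shift) pair, where
# the 32-bit payload is the multiplier and the 16-bit parameter the right
# shift, i.e. the float rescale factor is approximated as scale * 2^-shift
# (assuming that is how the helpers in scaling.py normalise their results).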

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)

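# Worked example (illustrative): an int8 feature map with h=4, w=8, depth=16
# in NHWC gives stride_c=1, stride_x=16, stride_y=128 bytes. The same shape
# in NHCWB16 (16-channel bricks) gives stride_x=16, stride_c=16*8=128 and
# stride_y = 1 * 8 * round_up(16, 16) = 128 bytes.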

def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]

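# Illustrative note on the tile box used above: a feature map is split into
# up to four tiles, selected in get_address() as
#   t0: x <  width_0, y <  height_0      t1: x >= width_0, y <  height_1
#   t2: x <  width_0, y >= height_0      t3: x >= width_0, y >= height_1
# get_address_ranges() returns one NpuAddressRange per tile (None for unused
# tiles); these ranges feed the memory-overlap checks further down.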
# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


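# Illustrative note: the returned 'outstanding' Watermark is what ends up as
# the parameter of NPU_OP_KERNEL_WAIT/NPU_OP_DMA_WAIT, i.e. "wait until at
# most N kernel/DMA operations are still in flight"; -1 means the dependency
# is already guaranteed and no wait command needs to be emitted.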
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"        {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Conv2D:
        generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if is_dma_op(npu_op):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    blocks = find_suitable_block_configs(arch, shared_buffer)
    return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
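

# Minimal usage sketch (illustrative, not part of this module): the public
# entry point above can be driven with a list of NpuOperation objects built
# via the external API; the accelerator name below is an assumption, see the
# NpuAccelerator enum in api.py for the available variants.
#
#   from ethosu.vela.api import NpuAccelerator
#   cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#   # 'cmds' is a flat list of 32-bit command words, ending with NPU_OP_STOP.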