# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


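# The RegisterMachine remembers the last value written to each register so
# that redundant SET commands can be dropped from the stream. The
# CommandStreamEmitter keeps two instances (one for DMA registers, one for
# everything else), and cmd_do_operation() switches banks after every NPU_OP_*
# command; with n_banks == 1 the bank switch is currently a no-op.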
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


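# Command word encoding: the low 16 bits of every command word hold the 10-bit
# opcode (CmdOpMask) plus the payload-mode bits (Mask); the upper 16 bits hold
# an immediate parameter. A NoPayload (cmd0) command is a single 32-bit word,
# while a Payload32 (cmd1) command is followed by one extra 32-bit payload
# word. Illustrative example: cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, 3) emits
# the single word (3 << 16) | cmd0.NPU_SET_BLOCKDEP.value.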
class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


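# A minimal worked example for quantise() below (values assumed): with
# quant.scale_f32 == 0.5 and quant.zero_point == 10, quantise(1.0, quant)
# returns 10 + round(1.0 / 0.5) == 12, assuming numeric_util.quantise_float32
# computes zero_point + round_away_zero(value / scale).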
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


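# Note on generate_activation() below: the clamp values default to the full
# OFM data type range and are additionally clamped to the int16 range, which
# is assumed here to be the width of the ACTIVATION_MIN/MAX register fields.
# For table lookup, activation values 16..23 select LUT slots 0..7, and for an
# INT32 OFM bits 12..13 of the ACTIVATION register force the I8 output range.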
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


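# Bit layout of the KERNEL_STRIDE register as packed by generate_kernel()
# below (field positions read off the shifts in the code):
#
#   bit 0     (stride_x - 1) & 1    x stride, low bit
#   bit 1     (stride_y - 1) & 1    y stride, low bit
#   bit 2     part-kernel-first weight traversal
#   bit 3     dilation_x - 1
#   bit 4     dilation_y - 1
#   bit 6..   (stride_x - 1) >> 1   x stride, extension bits
#   bit 9..   (stride_y - 1) >> 1   y stride, extension bits
#
# Illustrative example: a 3x3 kernel with stride 2x2, no dilation and
# depth-first traversal packs to 0b11 = 3, while KERNEL_HEIGHT_M1/WIDTH_M1
# hold the dilated extent, dilation * (kernel_size - 1) = 2.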
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise, set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers are not needed, so set AB_START to the size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


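# Note on generate_ofm_scaling_for_pooling() below: for fused SIGMOID/TANH the
# output scale is fixed at 1 / 0x3000, i.e. 1 / (3 * 4096), which is assumed
# to match the fixed-point range of the hardware activation tables; this is
# why the rescale computation starts from 0x3000 * ifm_scale. The same
# constant appears in generate_scaling_for_elementwise().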
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # The Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


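# Note on generate_scaling_for_elementwise() below: for ADD/SUB with differing
# input scales, the advanced scaling path rescales one operand via
# OPA_SCALE/OPB_SCALE, and the returned op_to_scale value is fed into bits 8..
# of the IFM_PRECISION register (see generate_common/generate_ifm_precision)
# to select which operand the scale applies to.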
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


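# Layout maths for get_strides() below: for NHWC the strides follow directly
# from the element size, while NHCWB16 groups the depth into bricks of 16
# channels, so the "C" stride steps between 16-deep bricks. Worked example
# (values assumed): an int8 NHWC feature map with h=2, w=3, c=8 gives strides
# C/X/Y = 1/8/24 bytes; the same map in NHCWB16 gives X = 16 * 1 = 16,
# C = 16 * 3 = 48 and Y = 3 * round_up(8, 16) = 48 bytes.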
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)


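# Tile addressing for get_address() below: a feature map can be split into up
# to four tiles, with base addresses[0..3] = top-left, top-right, bottom-left,
# bottom-right. The tile index t is derived from the coordinate: x >= width_0
# selects the right-hand tiles (t = 1), and y >= height_0 (height_1 on the
# right-hand side) selects the bottom tiles (t += 2).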
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns the address of the given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets the address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


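# Note on get_wait_dependency() below: the emitted wait commands carry an
# "allowed outstanding" count rather than an absolute stream position. The
# code counts how many DMA (or NPU) operations sit between the dependency and
# the current op; an emitted DMA wait of 0 is thus assumed to stall until
# every in-flight DMA transfer has completed.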
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


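# Note on calc_blockdep() below: BLOCKDEP limits how far the current operation
# may run ahead of the previous one, in units of output blocks. 0 fully
# serialises the two operations (used when there is no previous op or the
# shapes differ), MAX_BLOCKDEP allows free overlap when the operations touch
# disjoint addresses, and intermediate values are computed by
# arch.calc_block_dep() from the block shapes and kernels of both operations.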
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    prev_block_config: Optional[NpuShape3D],
    npu_op: NpuBlockOperation,
    block_config: NpuShape3D,
) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"    {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(
        arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
    )
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config.
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


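# Note on generate_command_stream() below: for every operation the emission
# order is fixed: (1) all SET_* register commands, (2) NPU_SET_BLOCKDEP for
# non-DMA operations, (3) any KERNEL_WAIT/DMA_WAIT commands, and (4) the
# NPU_OP_* command itself. The stream is terminated with NPU_OP_STOP
# (param 0xFFFF).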
def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in the final part of the command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


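# A minimal usage sketch for the entry point below (the operation list and the
# accelerator variant are assumptions, not part of this module):
#
#     ops = build_npu_ops()  # hypothetical helper producing List[NpuOperation]
#     cmds = generate_register_command_stream(ops, NpuAccelerator.Ethos_U55_128)
#
# As the docstring notes, external users are expected to reach this through
# the public facing API rather than importing this module directly.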
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = ArchitectureFeatures(
        vela_config_files=None,
        accelerator_config=accelerator.value,
        system_config=ArchitectureFeatures.DEFAULT_CONFIG,
        memory_mode=ArchitectureFeatures.DEFAULT_CONFIG,
        override_block_config=None,
        block_config_limit=None,
        max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
        weight_estimation_scaling=1.0,
        verbose_config=False,
    )
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
1297 return emit.to_list()