# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF

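# A cmd0 word packs a 10-bit opcode in bits [9:0], the payload mode in bits
# [15:14] and a 16-bit parameter in bits [31:16]; cmd1 commands are followed
# by a separate 32-bit payload word. Illustrative decode of a command word w:
#   op = cmd0(w & CmdMode.CmdOpMask)  # when (w & CmdMode.Mask) == CmdMode.NoPayload
#   param = (w >> 16) & 0xFFFF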

class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
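        # The wait parameter packs the event channel into bits [15:4] and the
        # allowed number of still-outstanding jobs into bits [3:0], e.g.
        # channel 0 with 2 outstanding jobs gives param 2 (illustrative)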
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address index for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantises the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
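    # e.g. value 1.0 with scale_f32 0.5 and zero_point 128 quantises to
    # round(1.0 / 0.5) + 128 = 130 (illustrative; assumes quantise_float32
    # rounds value / scale away from zero and adds the zero point)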
    return quantise_float32(value, scale, zp)


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
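    # e.g. a fused ReLU with max 6.0 on an int8 OFM with scale 6/255 and zero
    # point -128 clamps to [-128, round(6.0 * 255 / 6) - 128] = [-128, 127]
    # (illustrative)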
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
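    # e.g. signed int16 in NHCWB16 layout with op_to_scale 0 packs to
    # 1 | (1 << 2) | (1 << 6) = 0x45 (illustrative)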
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weight sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
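            # bin() prefixes "0b", hence the -2; e.g. rescale = 4.0 gives
            # len("0b100") - 2 + 1 = 4 bits for the rescale (illustrative)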
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # If an average pool is fused with a concat or other memory operation, rescaling may be needed.
        # In that case kernel height == kernel width == 1 always holds.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale must account for the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns which operand to scale.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
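    # e.g. int8 NHWC with shape h=4, w=8, c=16 gives strides c=1, x=16, y=128;
    # the same shape in NHCWB16 gives x=16, c=128, y=128 (illustrative)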
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete these dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


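# BLOCKDEP controls how many blocks of the previous operation may still be in
# flight when the next operation starts: 0 fully serialises the two
# operations, while ArchitectureFeatures.MAX_BLOCKDEP lets independent
# operations overlap freely (a descriptive summary of calc_blockdep below).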
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    prev_block_config: Optional[NpuShape3D],
    npu_op: NpuBlockOperation,
    block_config: NpuShape3D,
) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"        {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(
        arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
    )
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


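# A minimal usage sketch of the public API below (illustrative only; the
# NpuDmaOperation/NpuAddressRange construction assumes the definitions in
# api.py, and real feature-map operations need many more fields filled in):
#
#   dma = NpuDmaOperation(
#       src=NpuAddressRange(region=0, address=0x100, length=1024),
#       dest=NpuAddressRange(region=1, address=0x0, length=1024),
#   )
#   cmds = generate_register_command_stream([dma], Accelerator.Ethos_U55_128)
#   # "cmds" is now a list of 32-bit words for the Ethos-U55 command stream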
def generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: Accelerator) -> List[int]:
    """
    Public facing API for generating an ethosu register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: architecture_features.Accelerator enum to pick the correct ethosu accelerator
    :return: ethosu instructions, as a list of 32-bit integers
    """
    emit = CommandStreamEmitter()
    arch = ArchitectureFeatures(
        vela_config=None,
        system_config=None,
        accelerator_config=accelerator.value,
        override_block_config=None,
        block_config_limit=None,
        global_memory_clock_scale=1.0,
        max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
        weight_estimation_scaling=1.0,
    )
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()