# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and produces a
# bit stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


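# RegisterMachine caches the last value written to each register so that
# redundant register writes can be dropped from the command stream. With
# n_banks == 1, switch_bank() is effectively a no-op; it exists as a hook for
# double-banked register tracking around NPU_OP_* commands (see
# CommandStreamEmitter.cmd_do_operation, which switches bank after every
# operation).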
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True  # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


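# A command stream is a flat sequence of 32-bit words. The first word of every
# command packs the payload mode (CmdMode) and opcode into its low 16 bits and
# an immediate parameter into its high 16 bits; Payload32 commands are followed
# by one extra word carrying a 32-bit payload. For illustration, the one-word
# form assembled by cmd0_with_param() below is:
#
#     command = cmd.value | (param << 16)
#
# while cmd1_with_offset() additionally ORs in CmdMode.Payload32 and appends
# the payload word.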
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
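
# Illustrative example for quantise() above: with NpuQuantization(scale_f32=0.5,
# zero_point=10), a real value of 1.0 maps to 10 + round(1.0 / 0.5) = 12
# (assuming quantise_float32() computes zero_point + round(value / scale)).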


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant; set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
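
# KERNEL_STRIDE bit layout as assembled above: bit 0 holds the low bit of
# (stride_x - 1), bit 1 the low bit of (stride_y - 1), bit 2 the
# part-kernel-first flag, bits 3/4 hold (dilation_x/y - 1), and bits 6+ and 9+
# hold the stride extension bits. For example, 2x2 striding with no dilation
# and depth-first traversal encodes as 0b11 = 3.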


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias/scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # For ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
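
# Note: cmd1_with_offset() is reused here so that NPU_SET_OFM_SCALE carries the
# scale as its 32-bit payload and the shift in its 16-bit parameter field.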


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
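
# Worked example: an int8 NHWC feature map with h=4, w=8, c=16 yields
# stride_c = 1, stride_x = 16 and stride_y = 128 bytes; the same shape in
# NHCWB16 yields stride_x = 16, stride_c = 128 and stride_y = 128.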


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]
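
# Tile numbering used above: tile 0 is top-left, tile 1 top-right
# (x >= width_0), tile 2 bottom-left (y >= height_0) and tile 3 bottom-right,
# matching the tile index selection in get_address().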


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])
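
# Watermark serves two related purposes: get_wait_dependency() uses it as a
# search watermark holding indices into the operation list, and also returns a
# second Watermark of outstanding-operation counts for generate_cmd_waits(),
# where -1 means no wait command is needed.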


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
Tim Hall79d07d22020-04-27 18:20:16 +0100918
919
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100920def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
921 if cmd_waits.npu >= 0:
922 emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)
923
924 if cmd_waits.dma >= 0:
925 emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)
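
# The wait commands take an "outstanding count" rather than an absolute stream
# position: NPU_OP_KERNEL_WAIT n / NPU_OP_DMA_WAIT n stall the command stream
# until at most n kernel/DMA operations remain in flight, so a count of 0
# waits for everything issued so far on that queue.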


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth
Tim Hall79d07d22020-04-27 18:20:16 +0100963
964
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100965def calc_blockdep(
966 arch: ArchitectureFeatures,
967 prev_op: Optional[NpuBlockOperation],
968 prev_block_config: Optional[NpuShape3D],
969 npu_op: NpuBlockOperation,
970 block_config: NpuShape3D,
971) -> int:
972 """Calculates the value of the BLOCKDEP register"""
973 if prev_op is None:
974 return 0
975 if not is_dependent_on_prev_op(prev_op, npu_op):
976 return ArchitectureFeatures.MAX_BLOCKDEP
977 if prev_op.ofm.shape != npu_op.ifm.shape:
978 return 0
979 prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
980 prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
981 prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
982 prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
983 cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
984 cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
985 cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
986 cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
987 cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
988 blockdep = arch.calc_block_dep(
989 prev_ifm_rect,
990 prev_ofm_rect,
991 prev_ifm_block_depth,
992 prev_ofm_block,
993 to_kernel(prev_op.kernel),
994 cur_ifm_rect,
995 cur_ofm_rect,
996 cur_ifm_block_depth,
997 cur_ofm_block,
998 to_kernel(npu_op.kernel),
999 cur_padLT,
1000 )
1001 return blockdep
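
# BLOCKDEP expresses, in blocks, how far the current operation may run ahead of
# the previous one: MAX_BLOCKDEP means the operations are independent, while 0
# forces full serialisation, e.g. when the previous OFM shape differs from the
# current IFM shape and the overlap cannot be analysed.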


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"      {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"         {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"      IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"      Kernel: {k}")
    if npu_op.padding is not None:
        print(f"      {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"      Weights: {weights}")
    for bias in npu_op.biases:
        print(f"      Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"      Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"      {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"      Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
1110 """Generates register commands for depthwise convolution operations"""
1111 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
1112 ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
1113 shared_buffer = shared_buffer_allocation_for_npu_op(
1114 arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
1115 )
1116 block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
1117 generate_shram_registers_non_elementwise(emit, shared_buffer)
1118 return block_config
1119
1120
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero; can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
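

# Minimal usage sketch (illustrative only; "npu_ops" stands for an
# already-built list of NpuOperation objects, normally constructed via the
# public API in api.py):
#
#     from ethosu.vela.api import NpuAccelerator
#     cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#     # "cmds" is a list of 32-bit words, ready to be placed in the command stream buffer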