# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
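    """Remembers the last value written to each register so that redundant register writes can be skipped."""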
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
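    """Payload mode bits of a command code word; cmd1 commands carry a 32-bit payload word, cmd0 commands do not."""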
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
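    """Builds up the command stream as a list of command word tuples and tracks the current byte offset."""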
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
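        """DMA commands are tracked by a separate register machine from the NPU commands."""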
124 if "DMA" in cmd.name:
125 return self.reg_machine[1]
126 else:
127 return self.reg_machine[0]
128
129 def size_in_bytes(self):
130 sz = 0
131 for cmd in self.cmd_stream:
Tim Halle6ccd872020-11-09 16:46:37 +0000132 sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
Tim Hall79d07d22020-04-27 18:20:16 +0100133 return sz
134
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100135 def to_list(self) -> List[int]:
Tim Hall79d07d22020-04-27 18:20:16 +0100136 return [elem for cmd in self.cmd_stream for elem in cmd]
137
138 def print_cmds(self):
139 print("Code: Command: Param: Payload:")
140 for words_for_one_command in self.cmd_stream:
141 code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits
142 param = words_for_one_command[0] >> 16 # higher 16 bits
143
144 payload_mode = CmdMode(code & CmdMode.Mask)
145
146 # code and command
147 s = " 0x%04x " % code
148 if payload_mode == CmdMode.NoPayload:
149 s += str(cmd0(code & CmdMode.CmdOpMask))
150 else:
151 s += str(cmd1(code & CmdMode.CmdOpMask))
152
153 s = s.ljust(40)
154 s += "%5d" % param
155
156 # payload
157 if payload_mode == CmdMode.Payload32:
158 s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
159 else:
160 s += " -"
161
162 print(s)
163
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100164 def cmd0_with_param(self, cmd: cmd0, param):
Tim Hall79d07d22020-04-27 18:20:16 +0100165 if isinstance(param, Enum):
166 param = int(param.value)
167 else:
168 param = int(param)
169 param = param & 0xFFFF
170 command = cmd.value | (param << 16)
171 if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
172 return
173
174 # This is not a redundant command, actually write it
175 self.cmd_stream.append((command,))
Tim Halle6ccd872020-11-09 16:46:37 +0000176 self.offset += CommandStreamEmitter.WORD_SIZE
Tim Hall79d07d22020-04-27 18:20:16 +0100177
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100178 def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
Louis Verhaard893780c2021-03-30 09:02:30 +0200179 offset = int(offset) & 0xFFFFFFFF
180 param = int(param) & 0xFFFF
Tim Hall79d07d22020-04-27 18:20:16 +0100181 command = cmd.value | CmdMode.Payload32.value | (param << 16)
182
183 if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
184 return
185
186 # This is not a redundant command, actually write it
187 self.cmd_stream.append((command, offset))
Tim Halle6ccd872020-11-09 16:46:37 +0000188 self.offset += CommandStreamEmitter.WORD_SIZE * 2
Tim Hall79d07d22020-04-27 18:20:16 +0100189
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100190 def cmd1_with_address(self, cmd: cmd1, offset):
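        """Emits a cmd1 command for an address: the low 32 bits go in the payload, the bits above 32 in the parameter."""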
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
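        """Emits a KERNEL_WAIT/DMA_WAIT command; the parameter packs the event channel and the outstanding count."""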
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
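        """Emits an NPU_OP_* command word and cycles the register machine bank."""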
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


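# Maps NpuPoolingOp to the corresponding pooling_mode value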
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

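# Maps NpuElementWiseOp to the corresponding elementwise_mode value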
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

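# Maps NpuActivationOp to the corresponding activation value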
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

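# Maps NpuResamplingMode to the corresponding resampling_mode value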
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    pass_info = f", {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    if isinstance(npu_op, NpuDmaOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if isinstance(npu_op, NpuBlockOperation):
        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
        shared_buffer = create_shared_buffer(npu_op, arch)
        blocks = find_suitable_block_configs(arch, shared_buffer)
        return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]
    return []


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)