blob: fd32b6552e35f25ce85153565f40f317c827a7cb [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010024from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010025from typing import List
26from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010027
28import numpy as np
29
30from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010031from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010032from .api import NpuActivation
33from .api import NpuActivationOp
34from .api import NpuAddressRange
35from .api import NpuBlockOperation
36from .api import NpuBlockTraversal
37from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010038from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010039from .api import NpuDataType
40from .api import NpuDmaOperation
41from .api import NpuElementWiseOp
42from .api import NpuElementWiseOperation
43from .api import NpuFeatureMap
44from .api import NpuKernel
45from .api import NpuLayout
46from .api import NpuOperation
47from .api import NpuOperationType
48from .api import NpuPadding
49from .api import NpuPoolingOp
50from .api import NpuPoolingOperation
51from .api import NpuQuantization
52from .api import NpuResamplingMode
53from .api import NpuRoundingMode
54from .api import NpuShape3D
55from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010056from .architecture_allocator import ArchitectureBlockConfig
57from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010058from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010060from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010061from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010062from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010063from .ethos_u55_regs.ethos_u55_regs import acc_format
64from .ethos_u55_regs.ethos_u55_regs import activation
65from .ethos_u55_regs.ethos_u55_regs import cmd0
66from .ethos_u55_regs.ethos_u55_regs import cmd1
67from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020068from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020069from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010070from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import quantise_float32
72from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010073from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020074from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010075from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010076from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010077from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010078from .register_command_stream_util import calc_blockdep
79from .register_command_stream_util import get_dma_memory_accesses
80from .register_command_stream_util import get_op_memory_accesses
81from .register_command_stream_util import get_strides
82from .register_command_stream_util import get_wait_dependency
83from .register_command_stream_util import has_ifm2
Tim Halld8339a72021-05-27 18:49:40 +010084from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010085from .register_command_stream_util import to_kernel
86from .register_command_stream_util import UNARY_ELEMWISE_OPS
87from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010088
89
class RegisterMachine:
    """Tracks the last value written to each register so that redundant
    register writes can be elided from the command stream."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Record a write of `value` to `reg`; return True when the value
        differs from the last one seen by the current bank (i.e. the write
        must actually be emitted)."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # Debugging tip: return True unconditionally here to force every command
        return is_changed

    def switch_bank(self):
        """Advance to the next register bank (a no-op while n_banks == 1)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
104
105
class CmdMode(IntEnum):
    # Encoding of the payload-mode / opcode bits within a command stream word
    NoPayload = 0x0000  # cmd0: single word, parameter carried in the upper 16 bits
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of the command word
    CmdOpMask = 0x03FF  # selects the command opcode bits of the command word
111
112
class CommandStreamEmitter:
    """Builds up an Ethos-U command stream as a list of 32-bit words.

    Redundant register writes are skipped with the help of two RegisterMachine
    banks: one for DMA related commands and one for everything else.
    """

    WORD_SIZE = 4  # bytes per command stream word

    def __init__(self):
        self.cmd_stream = []  # one tuple of words per emitted command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset where the next command will land

    def get_reg_machine(self, cmd):
        # DMA registers are tracked separately from all other registers
        return self.reg_machine[1] if "DMA" in cmd.name else self.reg_machine[0]

    def size_in_bytes(self):
        """Total size of the emitted stream, in bytes."""
        return sum(len(words) for words in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Flatten the stream into a single list of 32-bit words."""
        return [word for words in self.cmd_stream for word in words]

    def print_cmds(self):
        """Pretty-print the emitted commands for debugging."""
        print("Code: Command: Param: Payload:")
        for words in self.cmd_stream:
            code = words[0] & 0x0000FFFF  # lower 16 bits
            param = words[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            line = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                line += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                line += str(cmd1(code & CmdMode.CmdOpMask))

            line = line.ljust(40)
            line += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                line += " 0x%08x (%d)" % (words[1], words[1])
            else:
                line += " -"

            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emit a cmd0 command with a 16-bit parameter, unless redundant."""
        param = int(param.value) if isinstance(param, Enum) else int(param)
        param &= 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # Not a redundant register write; append it to the stream
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emit a cmd1 command with a 32-bit payload, unless redundant."""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # Not a redundant register write; append command word + payload word
            self.cmd_stream.append((command, offset))
            self.offset += 2 * CommandStreamEmitter.WORD_SIZE

    def cmd1_with_address(self, cmd: cmd1, offset):
        # Low 32 address bits go in the payload, bits above 32 in the parameter
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emit a wait command; wait commands are never elided."""
        param = 16 * channel + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emit an operation command and switch register bank afterwards."""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
205
206
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100207# -------------------------------------------------------------------
208# REGISTER GENERATION
209# -------------------------------------------------------------------
210
211
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    # Bit flags for the IFM2_BROADCAST register
    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # set when the operands are reversed
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
219
220
# Maps NpuPoolingOp to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the corresponding activation register enum
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps NpuResamplingMode to the corresponding resampling_mode register enum
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
268
269
def check_mem_limits(memory_accesses: "MemoryAccessSet", mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits.

    :param memory_accesses: the operation's accesses; each access maps region index -> range set
    :param mem_limits: maps region index to the highest allowed address offset in that region
    :raises VelaError: if an access references an unknown region, or any range
        endpoint is negative or exceeds the region's limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Renamed from `max` to avoid shadowing the builtin
            max_offset = mem_limits[region]
            for start, end in range_set.ranges:
                # Both endpoints of every range must lie inside [0, max_offset]
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max_offset:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max_offset}")
283
284
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    # Missing quantization info falls back to scale 1 / zero point 0
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
290
291
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    # One register per padding edge, emitted in top/left/bottom/right order
    for pad_cmd, pad_value in (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    ):
        emit.cmd0_with_param(pad_cmd, pad_value)
298
299
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No fused activation is encoded as NONE_OR_RELU with an unclamped range
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # Quantize the requested clamp range; fall back to the OFM data type's full range
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    # Clamp to what both a 16-bit signed register value and the OFM type can represent
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        # LUT activations select one of 8 lookup tables, encoded as 16 + table index
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
326
327
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # NHCWB16 (brick format) base pointers must be 16-byte aligned
        for addr in addresses:
            assert int(addr) % 16 == 0
    # One base register per tile, in tile order
    for base_cmd, addr in zip(ptr_cmds, addresses):
        emit.cmd1_with_address(base_cmd, addr)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold extent minus one, in (height_0, height_1, width_0) order
    for tile_cmd, extent in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, extent - 1)
342
343
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    for stride_cmd, stride in (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horisontal values (W)
    ):
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100352
353
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # Bit 0: signedness; bits 2..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)

    # Bit 6: set for brick (NHCWB16) layout
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    # Operand scale selector, placed at bit 8 and up
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
366
367
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    # Bit 0: signedness; bits 1..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    # Bit 6: set for brick (NHCWB16) layout
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    # Rounding mode occupies the top bits (14 and up)
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
382
383
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        # For each mismatching dimension, IFM2 must be 1 and is broadcast over IFM
        for ifm_dim, ifm2_dim, broadcast_bit in (
            (ifm.shape.height, ifm2.shape.height, IFM2Broadcast.BroadcastHdim),
            (ifm.shape.width, ifm2.shape.width, IFM2Broadcast.BroadcastWdim),
            (ifm.shape.depth, ifm2.shape.depth, IFM2Broadcast.BroadcastCdim),
        ):
            if ifm_dim != ifm2_dim:
                assert ifm2_dim == 1
                ifm2_broadcast |= broadcast_bit

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
411
412
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    # Base addresses of the (up to 4) IFM tiles
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    # Tile extents: heights of tiles 0/1 and width of tile 0
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
428
429
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, tile addresses and tile extents only apply when IFM2 is a
        # real feature map (not a scalar constant)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    # Strides and zero point are emitted unconditionally
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
445
446
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    # Base addresses of the (up to 4) OFM tiles
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    # Tile extents: heights of tiles 0/1 and width of tile 0
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    # Full OFM shape (registers hold extent minus one)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
464
465
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Pack strides, dilations and traversal order into the KERNEL_STRIDE register
    stride_reg = stride_x_m1 & 1  # bit 0: kernel x stride low bit
    stride_reg |= (stride_y_m1 & 1) << 1  # bit 1: kernel y stride low bit
    stride_reg |= (stride_x_m1 >> 1) << 6  # bits 6..: kernel x stride extension
    stride_reg |= (stride_y_m1 >> 1) << 9  # bits 9..: kernel y stride extension
    stride_reg |= (kernel.dilation_x - 1) << 3
    stride_reg |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride_reg |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride_reg)
483
484
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    # Region is taken from the first range; presumably all weight ranges share it
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            # Core has its own weight range
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            # Core is present but has no weights: point it at the first range with zero length
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)
503
504
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    # Region is taken from the first range; presumably all bias ranges share it
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            # Core has its own bias/scale range
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            # Core is present but has no biases: point it at the first range with zero length
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)
520
521
def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # Registers hold block extent minus one, in height/width/depth order
    for blk_cmd, blk_extent in (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    ):
        emit.cmd0_with_param(blk_cmd, blk_extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100529
530
def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    # SHRAM buffer offsets come from the precomputed architecture block config layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        # Second input buffer start is only set when the operation has an IFM2
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100540
541
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): the body is empty, so this function always returns None
    # regardless of what the docstring promises; it looks like a leftover stub —
    # confirm whether any callers remain before implementing or removing it.
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100549
550
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    # Derive the NPU block type from the concrete operation class
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    # Table lookup activations reserve 2 SHRAM banks for the LUT
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    # Scaled mode requires every feature map involved to carry a float scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    # Validate the chosen block config against the architecture constraints
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        npu_op.ofm.shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100600
601
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    # A negative outstanding count means no wait is needed on that channel
    for wait_cmd, outstanding in (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    ):
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
609
610
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Input feature map: region/addresses/tiles/strides, precision and upscaling
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # Output feature map registers
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations carry no kernel; every other block operation does
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block configuration registers plus the SHRAM layout derived from it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100637
638
639# -------------------------------------------------------------------
640# SCALING
641# -------------------------------------------------------------------
642
643
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    Four cases are handled, in priority order:
    1. fused sigmoid/tanh activation - output scale is fixed to 1/0x3000,
       with special handling for INT16 power-of-two input scales
    2. fused quantize op - scale is the ratio of input and output scale
    3. explicit/bilinear rescale carried in pool_op.rescale
    4. plain average pooling - scale maximised for precision, taking the
       rescale between input and output quantization into account
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/2048 (tanh/sigmoid) or 1/4096 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Maximise precision: double the scale until it no longer fits
                # in half the int16 range (or the shift limit is reached)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # 8-bit input: quantise the pooling scale with enough headroom bits
            # to represent the rescale factor
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear operations with rescale
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization info available; use unit scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
715
716
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    # 0 means "no operand is scaled"; only the advanced add/sub path below
    # selects a specific operand (scaling.OperandToScale.OPa/OPb).
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Missing quantization on any feature map is represented as None and
        # falls through to unit scaling below.
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh expects a fixed output scale of 1/0x3000
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit (scale, shift) pair supplied by the caller takes precedence
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # Incomplete quantization info: fall back to unit scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if None in (input_scale, input2_scale, output_scale):
                # Incomplete quantization info: unit scaling, unless an explicit
                # rescale pair was supplied
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale and bitdepth == 16:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale /= 2
                opb_scale /= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LeakyReLU/Abs: only the output scale matters
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # All other elementwise operations use unit scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
795
796
797# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100798# PRINT
799# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200800
801
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints a one-line summary of the given feature map, plus its strides and tile layout."""
    if fm is None:
        return
    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    h, w, c = fm.shape
    # Total size in bytes of the (non-tiled) feature map
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = [hex(addr) for addr in t.addresses]
    print(f"    {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100817
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100818
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a multi-line, human readable description of a single NPU operation
    (used by the verbose command stream output)."""
    pass_info = f", {cmd}" if cmd else ""
    # Operations that are neither DMA nor block operations only get a one-line summary
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 stride-1 undilated convolution is reported as fully connected
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar second input: print the raw and quantized value instead of a feature map
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # Plain unclamped RELU/no-activation is not worth a line of output
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Only pooling/elementwise ops carry a rescale attribute
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100866
Tim Hall79d07d22020-04-27 18:20:16 +0100867
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints a human readable description of every operation in the list."""
    lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for op_index, operation in enumerate(npu_op_list):
        print_operation(operation, op_index, lookup.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100872
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873
874# -------------------------------------------------------------------
875# OPERATIONS
876# -------------------------------------------------------------------
877
878
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    # Dispatch on the concrete operation type; pooling and elementwise additionally
    # carry their sub-operation as the command parameter
    if isinstance(npu_op, NpuDmaOperation):
        # Parameter encodes DMA channel (upper bits) and mode (lower bits)
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"
893
894
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits all register settings for a 2D convolution operation."""
    # Conv2D is the only operation type that carries its own block traversal order
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100898
899
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits all register settings for a depthwise convolution operation."""
    # Depthwise convolutions always traverse depth first
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100905
906
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    Global OFM scaling is used for average/reduce-sum pooling without padding,
    or when an explicit, non-per-channel scaling is supplied via rescale.
    """
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    if npu_op.rescale is not None and isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100919
920
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    # These sub-operations have their OFM scale programmed globally
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific: unary operations have no second input
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100946
947
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    # Source: region index plus address within the region
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    # Destination: region index plus address, then the transfer length in bytes
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100956
957
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    """
    # Dispatch to the operation-type specific generator
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100975
976
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: high level NPU operations, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated commands
    :param mem_limits: maximum allowed address offset per region, for error checking
    :param add_to_debug_db: optional callback(npu_op, offset) invoked per operation
    :param npu_op_to_cmd: optional mapping from NPU operation to high-level command,
        only used for verbose printing
    :raises VelaError: if a memory limit is exceeded or the stream grows past 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Multi-core systems need the parallel mode register set up front
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            # Thread the dependency watermark through the whole stream so each
            # operation only waits on work it actually depends on
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # The hardware's command stream pointer is 24 bits wide
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
1042
1043
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    # Default address-offset limit for each of the 8 feature map regions;
    # the region used for memory-to-memory DMA is limited by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)
1059 return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)