blob: 0e68b1408f468b023546759b6e934758e88d0725 [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands and inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010024from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010025from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010026from typing import List
27from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010028
29import numpy as np
30
31from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010032from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010033from .api import NpuActivation
34from .api import NpuActivationOp
35from .api import NpuAddressRange
36from .api import NpuBlockOperation
37from .api import NpuBlockTraversal
38from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010039from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010040from .api import NpuDataType
41from .api import NpuDmaOperation
42from .api import NpuElementWiseOp
43from .api import NpuElementWiseOperation
44from .api import NpuFeatureMap
45from .api import NpuKernel
46from .api import NpuLayout
47from .api import NpuOperation
48from .api import NpuOperationType
49from .api import NpuPadding
50from .api import NpuPoolingOp
51from .api import NpuPoolingOperation
52from .api import NpuQuantization
53from .api import NpuResamplingMode
54from .api import NpuRoundingMode
55from .api import NpuShape3D
56from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010057from .architecture_allocator import ArchitectureBlockConfig
58from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010059from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010061from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010062from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010063from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010064from .ethos_u55_regs.ethos_u55_regs import acc_format
65from .ethos_u55_regs.ethos_u55_regs import activation
66from .ethos_u55_regs.ethos_u55_regs import cmd0
67from .ethos_u55_regs.ethos_u55_regs import cmd1
68from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020069from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020070from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010071from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import quantise_float32
73from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010074from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020075from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010076from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010077from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010078from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010079from .register_command_stream_util import calc_blockdep
80from .register_command_stream_util import get_dma_memory_accesses
81from .register_command_stream_util import get_op_memory_accesses
82from .register_command_stream_util import get_strides
83from .register_command_stream_util import get_wait_dependency
84from .register_command_stream_util import has_ifm2
Tim Halld8339a72021-05-27 18:49:40 +010085from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010086from .register_command_stream_util import to_kernel
87from .register_command_stream_util import UNARY_ELEMWISE_OPS
88from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value stored for each register so that repeated writes can be detected

    Values are kept per bank. With n_banks == 1, switch_bank() always selects bank 0 again.
    """

    def __init__(self):
        self.n_banks = 1
        # Registers that were never written read back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank and returns True if it differs from the stored value"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        # Debugging aid: return True unconditionally here to force every command to be emitted
        return previous != value

    def switch_bank(self):
        """Selects the next register bank, wrapping around after the last one"""
        next_idx = self.bank_idx + 1
        self.bank_idx = next_idx % self.n_banks
105
106
class CmdMode(IntEnum):
    """Payload mode values and bit masks for the 16-bit command code"""

    NoPayload = 0
    Payload32 = 1 << 14
    Mask = 3 << 14  # the two bits that hold the payload mode
    CmdOpMask = (1 << 10) - 1  # the low ten bits that hold the command opcode
112
113
class CommandStreamEmitter:
    """Accumulates encoded command words and filters out redundant register writes

    cmd_stream holds one tuple per emitted command: (command,) for commands without payload and
    (command, payload) for commands with a 32-bit payload. offset is increased by the size in bytes of
    every command that is appended.
    """

    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        # reg_machine[1] tracks commands with "DMA" in their name, reg_machine[0] all other commands
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that tracks the given command"""
        machine_idx = 1 if "DMA" in cmd.name else 0
        return self.reg_machine[machine_idx]

    def size_in_bytes(self):
        """Returns the total size in bytes of all commands in the stream"""
        return sum(len(cmd) * CommandStreamEmitter.WORD_SIZE for cmd in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns all command words as one flat list, in stream order"""
        words = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a header line followed by one line per command in the stream"""
        header = f" {'Offset':6}:"
        header += f" {'Payload':8}"
        header += f"{'Param':4}"  # no leading space for alignment
        header += f" {'Code':4}"
        header += f" - {'Command':30}"
        header += f" {'Param':5}"
        print(header)

        offset = 0
        for words in self.cmd_stream:
            first_word = words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            if payload_mode == CmdMode.NoPayload:
                payload_text = f" {'':8}"
                decoder, size = cmd0, 4
            else:
                assert payload_mode == CmdMode.Payload32
                payload_text = f" {words[1]:08x}"
                decoder, size = cmd1, 8

            line = f"0x{offset:06x}:" + payload_text
            line += f" {param:04x}"
            line += f" {code:04x}"
            line += f" - {decoder(code & CmdMode.CmdOpMask):30}"
            line += f" {param:5}"
            offset += size
            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a payload-less register command, unless the register already holds the same value"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # The register value changes, so the command has to be written
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a register command with a 32-bit payload, unless the register already holds the same value"""
        payload = int(offset) & 0xFFFFFFFF
        param16 = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param16 << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, payload)):
            # The register value changes, so the command has to be written
            self.cmd_stream.append((command, payload))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a payload command where the bits above bit 31 of offset are passed as the parameter"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Appends a wait command; wait commands are never filtered by the register machines"""
        param = outstanding_count + (16 * channel)
        self.cmd_stream.append((cmd.value | ((param & 0xFFFF) << 16),))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Appends an operation command unconditionally and switches the bank of its register machine"""
        command = cmd.value | ((int(param) & 0xFFFF) << 16)
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
217
218
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100219# -------------------------------------------------------------------
220# REGISTER GENERATION
221# -------------------------------------------------------------------
222
223
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100224# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are combined into the value written with NPU_SET_IFM2_BROADCAST"""

    BroadcastHdim = 0x01
    BroadcastWdim = 0x02
    BroadcastCdim = 0x04
    ReverseOperandOrder = 0x40
    UseIFM2Scalar = 0x80
231
232
# Maps NpuPoolingOp to the corresponding pooling_mode value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the corresponding elementwise_mode value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the corresponding activation enum member.
# NpuActivationOp.TABLE_LOOKUP has no entry here; generate_activation computes its value separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps NpuResamplingMode to the corresponding resampling_mode enum member
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
280
281
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    mem_limits maps a region to the highest offset that is allowed in that region.

    Raises VelaError if an access uses a region that has no entry in mem_limits, or if the start or
    end of an accessed range is negative or greater than the limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named "limit" rather than "max" so that the builtin max() is not shadowed
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
295
296
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value

    Without quantization information a scale of 1 and a zero point of 0 are used; a scale of 1 is
    also used if the quantization has no scale.
    """
    if quant is None:
        return quantise_float32(value, 1, 0)
    scale = quant.scale_f32 if quant.scale_f32 is not None else 1
    return quantise_float32(value, scale, quant.zero_point)
302
303
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_settings = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_settings:
        emit.cmd0_with_param(pad_cmd, amount)
310
311
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # A missing activation is treated as NONE_OR_RELU without explicit min/max
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    # Without an explicit bound the full range of the OFM data type is used
    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Clamp the bounds to both the int16 range and the range of the OFM data type
    int16_range = np.iinfo(np.int16)
    quantized_min = max(quantized_min, int16_range.min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, int16_range.max, ofm.data_type.max_value())

    is_table_lookup = act.op_type == NpuActivationOp.TABLE_LOOKUP
    if not is_table_lookup:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
338
339
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # For NHCWB16 every base pointer address has to be a multiple of 16 bytes
        assert not any(int(addr) % 16 for addr in addresses)
    # Exactly four base pointers are written
    for tile_idx in range(4):
        emit.cmd1_with_address(ptr_cmds[tile_idx], addresses[tile_idx])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100347
348
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # The registers hold the dimension minus one; tile_cmds is ordered HEIGHT0, HEIGHT1, WIDTH0
    dims_m1 = (tiles.height_0 - 1, tiles.height_1 - 1, tiles.width_0 - 1)
    for idx, dim_m1 in enumerate(dims_m1):
        emit.cmd0_with_param(tile_cmds[idx], dim_m1)
354
355
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    fm_strides = get_strides(fm)
    stride_settings = (
        (stride_c_cmd, fm_strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, fm_strides.height),  # stride between vertical values (H)
        (stride_x_cmd, fm_strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_settings:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100364
365
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    data_type = fm.data_type
    # Bit 0: set for signed data types
    sign_bit = 1 if data_type.is_signed() else 0
    # From bit 2: activation precision derived from the size of the data type
    precision_bits = precision_map[data_type.size_in_bits()] << 2
    # Bit 6: set for NHCWB16 layout
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0
    # From bit 8: op_to_scale
    prec = sign_bit | precision_bits | layout_bit | (op_to_scale << 8)
    emit.cmd0_with_param(precision_cmd, prec)
378
379
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    data_type = ofm.data_type
    # Bit 0: set for signed data types
    sign_bit = 1 if data_type.is_signed() else 0
    # From bit 1: activation precision derived from the size of the data type
    precision_bits = precision_map[data_type.size_in_bits()] << 1
    # Bit 8: global scale is used, as opposed to per channel scale
    global_scale_bit = (1 << 8) if use_global_scale else 0
    # Bit 6: set for NHCWB16 layout
    layout_bit = (1 << 6) if ofm.layout == NpuLayout.NHCWB16 else 0
    # From bit 14: rounding mode
    rounding_bits = rounding_mode_map[npu_op.rounding_mode] << 14
    prec = sign_bit | precision_bits | global_scale_bit | layout_bit | rounding_bits
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
394
395
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm_shape_source = npu_op.ifm
    ifm2_shape_source = npu_op.ifm2
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is None:
        # IFM2 is a feature map: broadcast every dimension in which it differs from IFM.
        # A dimension can only be broadcast if IFM2 has size 1 in that dimension.
        broadcast_dims = (
            ("height", IFM2Broadcast.BroadcastHdim),
            ("width", IFM2Broadcast.BroadcastWdim),
            ("depth", IFM2Broadcast.BroadcastCdim),
        )
        for dim_name, dim_flag in broadcast_dims:
            ifm_extent = getattr(ifm_shape_source.shape, dim_name)
            ifm2_extent = getattr(ifm2_shape_source.shape, dim_name)
            if ifm_extent != ifm2_extent:
                assert ifm2_extent == 1
                flags |= dim_flag
    else:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
423
424
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    depth_m1 = ifm.shape.depth - 1
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, depth_m1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    zero_point = int(ifm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, zero_point)
440
441
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    # Region, base addresses, tiles and strides are only emitted when IFM2 is not a scalar
    uses_feature_map = not has_scalar
    if uses_feature_map:
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
        generate_strides(
            emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X
        )
    # The zero point is emitted in both cases
    zero_point = int(ifm2.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, zero_point)
457
458
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The shape registers hold the dimension minus one
    ofm_shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm_shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm_shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm_shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    zero_point = int(ofm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, zero_point)
476
477
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    dilated_height_m1 = kernel.dilation_y * (kernel.height - 1)
    dilated_width_m1 = kernel.dilation_x * (kernel.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, dilated_height_m1)
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, dilated_width_m1)

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    part_kernel_first = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    stride = (
        (stride_x_m1 & 1)  # kernel x stride low bit
        | ((stride_y_m1 & 1) << 1)  # kernel y stride low bit
        | ((stride_x_m1 >> 1) << 6)  # kernel x stride extension bits
        | ((stride_y_m1 >> 1) << 9)  # kernel y stride extension bits
        | ((kernel.dilation_x - 1) << 3)
        | ((kernel.dilation_y - 1) << 4)
        | ((1 << 2) if part_kernel_first else 0)
    )
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
495
496
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    num_ranges = len(weights)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # Set weights sources for active and present cores
    base_cmds = (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT1_BASE)
    length_cmds = (cmd1.NPU_SET_WEIGHT_LENGTH, cmd1.NPU_SET_WEIGHT1_LENGTH)
    for core, (base_cmd, length_cmd) in enumerate(zip(base_cmds, length_cmds)):
        if core < num_ranges:
            address, length = weights[core].address, weights[core].length
        elif core < arch.ncores:
            # Core is present but has no weights of its own: first address with length 0
            address, length = weights[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
515
516
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    num_ranges = len(biases)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # Set scale sources for active and present cores
    base_cmds = (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE1_BASE)
    length_cmds = (cmd1.NPU_SET_SCALE_LENGTH, cmd1.NPU_SET_SCALE1_LENGTH)
    for core, (base_cmd, length_cmd) in enumerate(zip(base_cmds, length_cmds)):
        if core < num_ranges:
            address, length = biases[core].address, biases[core].length
        elif core < arch.ncores:
            # Core is present but has no range of its own: first address with length 0
            address, length = biases[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
532
533
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # The registers hold the block dimension minus one
    block_settings = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, dim in block_settings:
        emit.cmd0_with_param(blk_cmd, dim - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100542
543
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    # IFM2_IB_START is only emitted for operations that have a second input feature map
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    acc_format_value = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_value)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100555
556
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.

    NOTE(review): the function body consists of this docstring only, so every call currently returns
    None and none of the arguments is used. Confirm whether an implementation is missing or whether
    the function is obsolete.
    """
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100564
565
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # Two LUT banks are requested when the activation is a table lookup
    act = npu_op.activation
    uses_lut = act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has a quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    ofm_shape = shape3d_to_block(npu_op.ofm.shape)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        ofm_shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100615
616
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in pending_waits:
        # A negative count means that no wait is emitted for this command
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
624
625
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    ifm, ofm = npu_op.ifm, npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_mode = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_mode)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are not generated for elementwise operations
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block configuration and the SHRAM registers that follow from it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100652
653
654# -------------------------------------------------------------------
655# SCALING
656# -------------------------------------------------------------------
657
658
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    The scale/shift pair is chosen by the first matching case:
      1. the activation is SIGMOID or TANH,
      2. the operation has fused_quantize set,
      3. the operation carries a rescale (either an ExplicitScaling or a plain rescale factor),
      4. otherwise it is derived from the IFM/OFM scale ratio, or is (1, 0) when either scale is missing.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/2048 (tanh/sigmoid) or 1/4096 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale until it exceeds half of the int16 maximum (or shift passes 30)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # isinstance rather than an exact type() comparison, in line with the other type checks in this file
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear operations with rescale
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # Without both quantization scales no rescaling is applied
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
730
731
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the OFM scale register of an elementwise operation and, for add/sub,
    the OPA and OPB scale registers as well.

    Returns the operand to scale. The value stays 0 unless the advanced add/sub
    scaling is used.
    """
    op_to_scale = 0
    sub_op = npu_op.sub_op_type

    if sub_op not in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        if sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
            ofm_scale, shift = scaling.quantise_scale(npu_op.ofm.quantization.scale_f32)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
        return op_to_scale

    def scale_of(fm):
        # A feature map without quantization contributes no scale
        quant = fm.quantization
        return quant.scale_f32 if quant else None

    ifm_scale = scale_of(npu_op.ifm)
    ifm2_scale = scale_of(npu_op.ifm2)
    out_scale = scale_of(npu_op.ofm)

    act = npu_op.activation
    if act is not None and act.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh use a fixed output scale
        out_scale = 1 / 0x3000

    if sub_op == NpuElementWiseOp.MUL:
        if npu_op.rescale:
            ofm_scale, shift = npu_op.rescale
        elif None in (ifm_scale, ifm2_scale, out_scale):
            ofm_scale, shift = 1, 0
        else:
            ofm_scale, shift = scaling.elementwise_mul_scale(ifm_scale, ifm2_scale, out_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        return op_to_scale

    # Add/Sub
    opa_scale: float
    opb_scale: float
    bitdepth = npu_op.ifm.data_type.size_in_bits()
    needs_advanced_scaling = False
    if None in (ifm_scale, ifm2_scale, out_scale):
        opa_scale = opb_scale = ofm_scale = 1
        opa_shift = shift = 0
        if npu_op.rescale is not None:
            ofm_scale, shift = npu_op.rescale
    elif ifm_scale == ifm2_scale:
        opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
            ifm_scale, ifm2_scale, out_scale
        )
        opa_shift = 0  # Unused for this case
        if bitdepth == 16:
            # align the double rounding with that of advanced scaling
            opa_scale /= 2
            opb_scale /= 2
            shift -= 1
        else:
            # For 8 bit we can't guarantee double rounding with simplified scaling will always be
            # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
            # the following we know that double rounding will have no effect for advanced scaling
            # no matter the input, so we can safely use simplified scaling with double rounding disabled.
            needs_advanced_scaling = (int(ofm_scale) & 0xFFF) != 0
    else:
        needs_advanced_scaling = True

    if needs_advanced_scaling:
        # Use advanced implementation only when input/output scales differ,
        # or when we can't guarantee the absence of rounding errors
        opa_scale, opa_shift, ofm_scale, shift, op_to_scale = scaling.advanced_elementwise_add_sub_scale(
            ifm_scale, ifm2_scale, out_scale, bitdepth
        )
        opb_scale = 0  # Unused for this case
        if npu_op.reversed_operands:
            # If the operand order is reversed we also have to swap which operand is scaled
            op_to_scale = (
                scaling.OperandToScale.OPb
                if op_to_scale == scaling.OperandToScale.OPa
                else scaling.OperandToScale.OPa
            )

    emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
    emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
816
817
818# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100819# PRINT
820# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200821
822
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides, tiles and name of a feature map.

    Does nothing when fm is None.
    """
    if fm is None:
        return

    if fm.quantization is None:
        quant_text = "no quantization"
    else:
        quant_text = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    height, width, channels = fm.shape
    byte_size = height * width * channels * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={channels}, region={fm.region}, {fm.layout}, {fm.data_type}, "
        f"size={byte_size}, {quant_text}"
    )

    fm_strides = get_strides(fm)
    stride_text = f"Stride y/x/c: {fm_strides.height}/{fm_strides.width}/{fm_strides.depth}"
    tiles = fm.tiles
    base_addresses = list(map(hex, tiles.addresses))
    print(
        f" {stride_text}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, "
        f"base={base_addresses}"
    )
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100839
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100840
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of one NPU operation.

    :param npu_op: the operation to print
    :param index: number printed in front of the operation
    :param cmd: optional extra information appended to the header line when truthy

    Operations that are neither DMA nor block operations, and DMA operations, are printed as a
    single header line. Block operations additionally get their feature maps, kernel, padding,
    weights, biases, activation and block config printed.
    """
    pass_info = f" {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations are all 1 is labelled "FullyConnected".
        # k may be None (see above), so it must be checked before it is dereferenced.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # A NONE_OR_RELU activation without min/max carries no information worth printing
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # rescale is only printed for pooling and elementwise operations
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100888
Tim Hall79d07d22020-04-27 18:20:16 +0100889
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, numbered by its position.

    npu_op_to_cmd optionally maps an operation to extra information that is printed with it.
    """
    cmd_lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        extra_info = cmd_lookup.get(operation)
        print_operation(operation, position, extra_info)
Tim Hall79d07d22020-04-27 18:20:16 +0100894
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100895
896# -------------------------------------------------------------------
897# OPERATIONS
898# -------------------------------------------------------------------
899
900
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the type of the given operation"""
    if isinstance(npu_op, NpuDmaOperation):
        # Channel and mode are packed into a single parameter
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        pool_param = pooling_op_map[npu_op.sub_op_type]
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pool_param)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        elementwise_param = elementwise_op_map[npu_op.sub_op_type]
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_param)
        return
    assert 0, "Unsupported operation"
915
916
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the registers of a Conv2D operation, using the block traversal stored on the operation"""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100920
921
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the registers of a depthwise convolution, which always uses depth first block traversal"""
    traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100927
928
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    Global OFM scaling is used for AVERAGE and REDUCE_SUM pooling whose padding values sum to
    zero. When the operation carries an ExplicitScaling in its rescale attribute, that decides
    instead: global scaling is used unless the explicit scaling is per channel.
    """
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # isinstance is False for None, so no separate None check is needed
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100941
942
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Emits the registers of an elementwise operation.

    Scaling registers come first, then the common registers. Binary operations
    additionally get the IFM2 registers and, if IFM2 is a scalar, the quantized scalar value.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Unary operations have no second operand to describe
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return

    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    ifm2 = npu_op.ifm2
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, ifm2, has_scalar)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return

    quantized_scalar = quantise(npu_op.ifm2_scalar, ifm2.quantization)
    # The quantized scalar must fit in the IFM2 data type
    assert ifm2.data_type.min_value() <= quantized_scalar <= ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100968
969
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits source region/address, destination region/address and length registers of a DMA operation"""
    source = dma_op.src
    destination = dma_op.dest

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)
    # The transfer length is taken from the source
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100978
979
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.

    Dispatches on the operation class to the matching generate_*_op function.
    Nothing is returned; all output goes to the emitter.
    An operation of any other class triggers an assertion failure.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        # DMA register generation does not use the architecture description
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100997
998
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Turns a list of NPU operations into an Ethos-U register command stream.

    For every operation the memory limits are checked, wait dependencies are calculated and the
    registers are emitted, followed by BLOCKDEP (block operations only), any wait commands and
    the NPU_OP command itself. The stream is terminated with NPU_OP_STOP.

    Returns the Ethos-U instructions, as a list of 32-bit integers.
    Raises VelaError if an operation fails (the message is extended with the operation index and
    type) or if the finished stream is 16 MiB or larger.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for op in npu_op_list:
        if isinstance(op, NpuDmaOperation):
            memory_accesses[op] = get_dma_memory_accesses(op)
        elif isinstance(op, NpuBlockOperation):
            memory_accesses[op] = get_op_memory_accesses(op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)
    previous_block_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        is_block_op = isinstance(npu_op, NpuBlockOperation) and not isinstance(npu_op, NpuDmaOperation)
        if is_block_op:
            # Generate BLOCKDEP, capped at what the architecture supports
            blockdep = min(calc_blockdep(arch, previous_block_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            previous_block_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    command_words = emit.to_list()

    if emit.size_in_bytes() >= (1 << 24):
        raise VelaError(
            "The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return command_words
1064
1065
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    # Regions 0-7 are limited by the maximum address offset of the architecture
    mem_limits: Dict[int, int] = {region: arch.max_address_offset for region in range(8)}
    # The mem2mem base pointer index is limited by the SHRAM size instead
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)