blob: 7858e70bdcb1c180c999a9096a320b5436b32b12 [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010024from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010025from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010026from typing import List
27from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010028
29import numpy as np
30
31from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010032from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010033from .api import NpuActivation
34from .api import NpuActivationOp
35from .api import NpuAddressRange
36from .api import NpuBlockOperation
37from .api import NpuBlockTraversal
38from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010039from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010040from .api import NpuDataType
41from .api import NpuDmaOperation
42from .api import NpuElementWiseOp
43from .api import NpuElementWiseOperation
44from .api import NpuFeatureMap
45from .api import NpuKernel
46from .api import NpuLayout
47from .api import NpuOperation
48from .api import NpuOperationType
49from .api import NpuPadding
50from .api import NpuPoolingOp
51from .api import NpuPoolingOperation
52from .api import NpuQuantization
53from .api import NpuResamplingMode
54from .api import NpuRoundingMode
55from .api import NpuShape3D
56from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010057from .architecture_allocator import ArchitectureBlockConfig
58from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010059from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010061from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010062from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010063from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010064from .ethos_u55_regs.ethos_u55_regs import acc_format
65from .ethos_u55_regs.ethos_u55_regs import activation
66from .ethos_u55_regs.ethos_u55_regs import cmd0
67from .ethos_u55_regs.ethos_u55_regs import cmd1
68from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020069from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020070from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010071from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import quantise_float32
73from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010074from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020075from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010076from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010077from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010078from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010079from .register_command_stream_util import calc_blockdep
80from .register_command_stream_util import get_dma_memory_accesses
81from .register_command_stream_util import get_op_memory_accesses
82from .register_command_stream_util import get_strides
83from .register_command_stream_util import get_wait_dependency
84from .register_command_stream_util import has_ifm2
Tim Halld8339a72021-05-27 18:49:40 +010085from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010086from .register_command_stream_util import to_kernel
87from .register_command_stream_util import UNARY_ELEMWISE_OPS
88from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to each register so that redundant writes can be detected.

    Values are stored per bank; n_banks is 1 here, so switching bank always lands on bank 0.
    """

    def __init__(self):
        self.n_banks = 1
        # One mapping per bank; a register that has never been written reads back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank and returns True if it differs from the previous value."""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        # Debug aid: return True unconditionally here to force every command to be emitted
        return previous != value

    def switch_bank(self):
        """Advances to the next register bank, wrapping around after the last one."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
105
106
class CmdMode(IntEnum):
    """Payload mode values and bit masks for the lower 16 bits (the command code) of a command word."""

    # Payload modes; a command code masked with Mask yields one of these
    NoPayload = 0x0000
    Payload32 = 0x4000
    # Mask that selects the payload mode bits of a command code
    Mask = 0xC000
    # Mask that selects the command opcode bits of a command code
    CmdOpMask = 0x03FF
112
113
class CommandStreamEmitter:
    """Collects register commands as tuples of 32-bit words and filters out redundant register writes."""

    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        # Index 0 tracks registers of non-DMA commands, index 1 those of DMA commands (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Number of bytes emitted so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine for cmd; commands with "DMA" in their name use a separate one."""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the size of the whole command stream in bytes."""
        return sum(len(words) * CommandStreamEmitter.WORD_SIZE for words in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into one list of words."""
        flattened = []
        for words in self.cmd_stream:
            flattened.extend(words)
        return flattened

    def print_cmds(self):
        """Prints a header line followed by one line per command (code, command name, parameter, payload)."""
        print("Code: Command: Param: Payload:")
        for words in self.cmd_stream:
            first_word = words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            opcode = code & CmdMode.CmdOpMask

            # code and command
            decoded = cmd0(opcode) if payload_mode == CmdMode.NoPayload else cmd1(opcode)
            line = (" 0x%04x " % code) + str(decoded)
            line = line.ljust(40) + ("%5d" % param)

            # payload
            if payload_mode == CmdMode.Payload32:
                line += " 0x%08x (%d)" % (words[1], words[1])
            else:
                line += " -"

            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a one-word register command with a 16-bit parameter, unless the write is redundant.

        param may be an Enum member (its value is used) or anything accepted by int().
        """
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word register command (command word + 32-bit payload), unless the write is redundant."""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two-word command whose parameter carries the bits of offset above bit 31."""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; the parameter is 16 * channel + outstanding_count. Never filtered out."""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (never filtered out) and switches the register bank of its machine."""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
206
207
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100208# -------------------------------------------------------------------
209# REGISTER GENERATION
210# -------------------------------------------------------------------
211
212
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Flag bits that are OR-ed together to form the value written to NPU_SET_IFM2_BROADCAST."""

    BroadcastHdim = 0x01  # bit 0
    BroadcastWdim = 0x02  # bit 1
    BroadcastCdim = 0x04  # bit 2
    ReverseOperandOrder = 0x40  # bit 6
    UseIFM2Scalar = 0x80  # bit 7
220
221
# Maps an API pooling operation to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise operation to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation operation to the corresponding activation register enum.
# NpuActivationOp.TABLE_LOOKUP has no entry here; generate_activation handles it separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode register enum
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
269
270
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits.

    memory_accesses: the accesses to check; every start and end offset of every range is validated
    mem_limits: maps region to the highest address offset that is allowed in that region

    Raises VelaError if a region is missing from mem_limits, or if an offset is negative or above the limit.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named "limit" rather than "max" so that the builtin max() is not shadowed
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
284
285
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value.

    Without quantization info, scale 1 and zero point 0 are used; a missing scale_f32 is treated as scale 1.
    """
    if quant is None:
        return quantise_float32(value, 1, 0)
    scale = quant.scale_f32 if quant.scale_f32 is not None else 1
    return quantise_float32(value, scale, quant.zero_point)
291
292
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers (top, left, bottom, right, in that order)"""
    pad_settings = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_settings:
        emit.cmd0_with_param(pad_cmd, amount)
299
300
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers (ACTIVATION, ACTIVATION_MIN and ACTIVATION_MAX).

    A missing activation is treated as NONE_OR_RELU. Missing min/max default to the limits of the OFM data type.
    """
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Clamp to what fits in both int16 and the OFM data type
    int16_range = np.iinfo(np.int16)
    quantized_min = max(quantized_min, int16_range.min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, int16_range.max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
327
328
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers; ptr_cmds[i] is written with addresses[i] for i in 0..3"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert not any(int(addr) % 16 for addr in addresses)
    for tile_index in range(4):
        emit.cmd1_with_address(ptr_cmds[tile_index], addresses[tile_index])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100336
337
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers.

    tile_cmds holds the commands for height 0, height 1 and width 0 (in that order); each is written
    with the corresponding tile size minus one.
    """
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for index, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[index], size - 1)
343
344
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the strides that get_strides returns for fm"""
    strides = get_strides(fm)
    stride_settings = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_settings:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100353
354
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register.

    The value combines: bit 0 = signed data type, bits 2 and up = activation precision,
    bit 6 = NHCWB16 layout, bits 8 and up = op_to_scale.
    """
    dtype = fm.data_type
    signed_bit = 1 if dtype.is_signed() else 0
    precision_bits = precision_map[dtype.size_in_bits()] << 2
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0
    scale_bits = op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, (signed_bit + precision_bits) | layout_bit | scale_bits)
367
368
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register.

    The value combines: bit 0 = signed data type, bits 1 and up = activation precision,
    bit 6 = NHCWB16 layout, bit 8 = global scale, bits 14 and up = rounding mode.
    """
    ofm = npu_op.ofm
    dtype = ofm.data_type
    prec = (1 if dtype.is_signed() else 0) + (precision_map[dtype.size_in_bits()] << 1)

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    rounding_value = rounding_mode_map[npu_op.rounding_mode]
    prec |= rounding_value << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
383
384
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations.

    When IFM2 is not a scalar, a dimension is broadcast if its size differs between IFM and IFM2;
    in that case the IFM2 size of that dimension must be 1.
    """
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape = npu_op.ifm.shape
        ifm2_shape = npu_op.ifm2.shape
        per_dimension_flags = (
            ("height", IFM2Broadcast.BroadcastHdim),  # Broadcast in 'H' dimension
            ("width", IFM2Broadcast.BroadcastWdim),  # Broadcast in 'W' dimension
            ("depth", IFM2Broadcast.BroadcastCdim),  # Broadcast in 'C' dimension
        )
        for dimension, broadcast_flag in per_dimension_flags:
            if getattr(ifm_shape, dimension) != getattr(ifm2_shape, dimension):
                assert getattr(ifm2_shape, dimension) == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
412
413
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, base addresses, tiles, depth, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    zero_point = int(ifm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, zero_point)
429
430
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers.

    Region, base addresses and tiles are only generated when IFM2 is not a scalar.
    """
    if not has_scalar:
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    # NOTE(review): strides and zero point are assumed to be generated regardless of has_scalar - confirm
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    zero_point = int(ifm2.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, zero_point)
446
447
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, base addresses, tiles, shape, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # Shape registers hold the size minus one
    shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    zero_point = int(ofm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, zero_point)
465
466
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers (KERNEL_HEIGHT_M1, KERNEL_WIDTH_M1 and KERNEL_STRIDE)"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # set kernel x stride low bit
    stride = stride_x_m1 & 1
    # set kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9
    # set dilation (minus one) for x and y
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
484
485
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers; does nothing if weights is empty.

    weights[i] is the weight source for core i. The region is taken from weights[0].
    """
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    per_core_cmds = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            address, length = weights[core].address, weights[core].length
        elif core < arch.ncores:
            # Core is present but has no weights of its own: first address, zero length
            address, length = weights[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
504
505
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers; does nothing if biases is empty.

    biases[i] is the scale/bias source for core i. The region is taken from biases[0].
    """
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    per_core_cmds = (
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    )
    # Set scale/bias sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            address, length = biases[core].address, biases[core].length
        elif core < arch.ncores:
            # Core is present but has no biases of its own: first address, zero length
            address, length = biases[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
521
522
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers; each holds the block dimension minus one"""
    block_settings = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, dimension in block_settings:
        emit.cmd0_with_param(blk_cmd, dimension - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100531
532
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers.

    IFM2_IB_START is only generated if the operation has an IFM2.
    """
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    accumulator_format = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, accumulator_format)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100544
545
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): this function has no implementation, so it always returns None - confirm whether
    # that is intended or whether callers should use get_arch_block_config instead
    return None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100553
554
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation.

    Asserts that npu_op.block_config is set and that try_block_config accepts it.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # A table lookup activation uses 2 LUT banks
    act = npu_op.activation
    lut_banks = 2 if (act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP) else 0

    # Scaling is used only if every feature map of the operation has a quantization with a scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps
    )

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100604
605
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT; a wait is only emitted if its outstanding count is not negative"""
    wait_settings = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding_count in wait_settings:
        if outstanding_count >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding_count)
613
614
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations.

    Padding registers are only generated if the operation has padding; kernel registers are
    generated for everything except elementwise operations.
    """
    ifm, ofm = npu_op.ifm, npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel, weights, biases and activation
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block configuration and shared buffer registers
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100641
642
643# -------------------------------------------------------------------
644# SCALING
645# -------------------------------------------------------------------
646
647
def _rescale_bit_count(rescale):
    """Returns the binary digit count of rescale rounded up to an integer, plus one"""
    return len(bin(round_up_to_int(rescale))) - 2 + 1


def _quantise_pooling_rescale(kernel, rescale, rescale_bits):
    """Quantises the pooling scale for the kernel area and folds rescale into the scale.

    Returns a (scale, shift) tuple.
    """
    scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
    return int(round_away_zero(scale * rescale)), shift


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations

    The (scale, shift) pair is chosen by the first matching case:
      1. the operation has a fused SIGMOID/TANH activation
      2. the operation has a fused quantize
      3. the operation carries a rescale (an ExplicitScaling, or a plain rescale factor)
      4. otherwise it is derived from the IFM/OFM quantization scales, or is (1, 0) if either scale is missing
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        act_type = pool_op.activation.op_type
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (act_type == NpuActivationOp.TANH and shift in (0, 1))
                or (act_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid, shift == 0)
                # or 1/2048 (tanh only, shift == 1)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale until it exceeds half of the int16 maximum (or the shift limit is hit)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            scale, shift = _quantise_pooling_rescale(kernel, rescale, _rescale_bit_count(rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear operations with rescale
            rescale = pool_op.rescale
            scale, shift = _quantise_pooling_rescale(kernel, rescale, _rescale_bit_count(rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = _rescale_bit_count(rescale)
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = _quantise_pooling_rescale(kernel, rescale, rescale_bits)
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
719
720
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the OFM_SCALE register (and, for Add/Sub, the OPA_SCALE/OPB_SCALE registers)
    for an elementwise operator.
    Returns the operand to scale; this is 0 unless advanced Add/Sub scaling is used.
    """
    op_to_scale = 0
    sub_op = npu_op.sub_op_type

    if sub_op not in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        if sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
            # Scale is derived from the OFM quantization only
            ofm_scale, shift = scaling.quantise_scale(npu_op.ofm.quantization.scale_f32)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
        return op_to_scale

    # Add, Sub or Mul: collect the quantization scales, None where quantization is missing
    input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
    input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
    output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

    activation = npu_op.activation
    if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Fused sigmoid/tanh uses a fixed output scale
        output_scale = 1 / 0x3000

    scale_missing = None in (input_scale, input2_scale, output_scale)

    if sub_op == NpuElementWiseOp.MUL:
        if npu_op.rescale:
            ofm_scale, shift = npu_op.rescale
        elif scale_missing:
            ofm_scale, shift = 1, 0
        else:
            ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        return op_to_scale

    # Add/Sub
    opa_scale: float
    opb_scale: float
    bitdepth = npu_op.ifm.data_type.size_in_bits()
    use_advanced_scaling = False
    if scale_missing:
        opa_scale = opb_scale = ofm_scale = 1
        opa_shift = shift = 0
        if npu_op.rescale is not None:
            ofm_scale, shift = npu_op.rescale
    elif input_scale == input2_scale:
        opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
            input_scale, input2_scale, output_scale
        )
        opa_shift = 0  # Unused for this case
        if bitdepth == 16:
            # align the double rounding with that of advanced scaling
            opa_scale /= 2
            opb_scale /= 2
            shift -= 1
        else:
            # For 8 bit we can't guarantee double rounding with simplified scaling will always be
            # the same as with advanced scaling due to different shifts. When the low 12 bits of the
            # ofm scale are all zero we know that double rounding will have no effect for advanced
            # scaling no matter the input, so we can safely use simplified scaling with double
            # rounding disabled.
            use_advanced_scaling = (int(ofm_scale) & 0xFFF) != 0
    else:
        use_advanced_scaling = True

    if use_advanced_scaling:
        # Use advanced implementation only when input/output scales differ,
        # or when we can't guarantee the absence of rounding errors
        advanced = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
        opa_scale, opa_shift, ofm_scale, shift, op_to_scale = advanced
        opb_scale = 0  # Unused for this case
        if npu_op.reversed_operands:
            # If the operand order is reversed we also have to swap which operand is scaled
            op_to_scale = (
                scaling.OperandToScale.OPb
                if op_to_scale == scaling.OperandToScale.OPa
                else scaling.OperandToScale.OPa
            )

    emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
    emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
805
806
807# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100808# PRINT
809# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200810
811
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides, tiles and name of a feature map.

    Nothing is printed if fm is None.
    """
    if fm is None:
        return
    quant = fm.quantization
    if quant is None:
        q = "no quantization"
    else:
        q = f"scale: {quant.scale_f32}, zero: {quant.zero_point}"
    h, w, c = fm.shape
    # Size in bytes of h * w * c elements of the feature map's data type
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100828
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100829
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a human readable description of one NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations, are printed
    as a single header line. Block operations additionally get their feature maps, kernel,
    padding, weights, biases, activation and block config printed.

    :param npu_op: the operation to print
    :param index: number printed in front of the operation
    :param cmd: optional extra info appended to the header line when truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    is_dma = isinstance(npu_op, NpuDmaOperation)
    if isinstance(npu_op, NpuOperation) and not (is_dma or isinstance(npu_op, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if is_dma:
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return

    # Block operation: header line
    k = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    has_sub_op = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))
    if has_sub_op:
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations are all 1 is labelled as fully connected
        is_fully_connected = (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        )
        fc = "FullyConnected " if is_fully_connected else ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and biases
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    # Activation is only printed when it is not a plain NONE_OR_RELU without min/max
    act = npu_op.activation
    if act is not None and (act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None):
        lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
        print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_sub_op else ""
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100877
Tim Hall79d07d22020-04-27 18:20:16 +0100878
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list, prefixed by its position in the list.

    :param npu_op_list: operations to print
    :param npu_op_to_cmd: optional mapping from operation to extra info shown on its header line
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, npu_op_to_cmd.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100883
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100884
885# -------------------------------------------------------------------
886# OPERATIONS
887# -------------------------------------------------------------------
888
889
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the type of npu_op"""
    if isinstance(npu_op, NpuDmaOperation):
        # The DMA parameter combines channel and mode
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
904
905
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the register commands for a Conv2D operation, using the operation's own block traversal"""
    block_traversal = npu_op.block_traversal
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100909
910
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the register commands for a depthwise convolution; block traversal is always depth first"""
    block_traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100916
917
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations

    Global OFM scaling is used for AVERAGE/REDUCE_SUM pooling without padding. When the
    operation carries an ExplicitScaling in rescale, global scaling is instead used
    exactly when that scaling is not per-channel.
    """
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # (isinstance is False for None, so no separate None check is needed)
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100930
931
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the register commands for an elementwise operation.

    Scaling registers are emitted first, then the common registers, and for binary
    operations finally the IFM2 registers (including the quantized scalar, if any).
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no IFM2 to set up
        return

    # Binary operation; generate IFM2 registers
    ifm2 = npu_op.ifm2
    assert ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, ifm2, has_scalar)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return
    quantized_scalar = quantise(npu_op.ifm2_scalar, ifm2.quantization)
    # The quantized scalar must be representable in the IFM2 data type
    assert ifm2.data_type.min_value() <= quantized_scalar <= ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100957
958
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits the DMA0 source, destination and length registers for a DMA operation"""
    source = dma_op.src
    destination = dma_op.dest

    # Source region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    # Destination region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    # Transfer length is taken from the source
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100967
968
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches on the type of npu_op to the matching generate_*_op function; nothing is returned.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        # DMA register generation does not depend on the architecture
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100986
987
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates the register command stream for a list of NPU operations.

    For every operation the memory limits are checked, wait dependencies are calculated,
    its registers are generated, and finally the NPU_OP command itself is emitted.
    The stream is terminated by an NPU_OP_STOP command.

    :param npu_op_list: the operations to generate commands for
    :param arch: architecture features of the target
    :param verbose: if True, prints the operations and the generated commands
    :param mem_limits: memory limits passed to check_mem_limits for every operation
    :param add_to_debug_db: optional callback, called with (operation, stream offset) after each operation
    :param npu_op_to_cmd: optional mapping from operation to extra info used by verbose printing
    :return: Ethos-U instructions, as a list of 32-bit integers
    :raises VelaError: if generating an operation fails, or if the command stream reaches 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for op in npu_op_list:
        if isinstance(op, NpuDmaOperation):
            memory_accesses[op] = get_dma_memory_accesses(op)
        elif isinstance(op, NpuBlockOperation):
            memory_accesses[op] = get_op_memory_accesses(op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        if isinstance(npu_op, NpuBlockOperation) and not isinstance(npu_op, NpuDmaOperation):
            # Generate BLOCKDEP, capped at what the architecture supports
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    stream_size = emit.size_in_bytes()
    if stream_size >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {stream_size/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
1053
1054
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    # Regions 0-7 are limited by the maximum address offset of the default architecture
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    # The mem2mem base pointer index is limited by the SHRAM size instead
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)