# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations, generates
# all the register settings, calculates dependencies between commands and inserts wait operations, and generates
# a bit stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import try_block_config
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import create_default_arch
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import shape3d_to_block
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark


class RegisterMachine:
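    """Tracks the last value written to each register so that redundant register writes can be
    skipped: set_register() returns True only when the new (command, value) pair differs from
    what the active bank already holds."""
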
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True  # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
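    """Emits the command stream as a list of 32-bit words.

    As encoded below, each command carries a 10-bit opcode plus payload-mode bits in the lower
    16 bits and a 16-bit parameter in the upper 16 bits; cmd1 commands append one additional
    32-bit payload word (CmdMode.Payload32).
    """
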
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
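        # Addresses may be wider than 32 bits (e.g. on Ethos-U65); the low 32 bits go into the
        # payload word and the remaining high bits are carried in the 16-bit parameter field.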
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
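    # The stride register packs x/y stride, dilation and block traversal into a single word.
    # Worked example from the packing below: stride 2x2, dilation 1x1, part-kernel-first -> 0b111.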
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias/scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])


def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """


def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Returns the architecture block config for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        npu_op.ofm.shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # If avg pool is fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
710 """
711 op_to_scale = 0
712 if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
713 input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
714 input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
715 output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
716
717 if npu_op.activation is not None and npu_op.activation.op_type in (
718 NpuActivationOp.SIGMOID,
719 NpuActivationOp.TANH,
720 ):
721 output_scale = 1 / 0x3000
722
723 if npu_op.sub_op_type == NpuElementWiseOp.MUL:
724 if None in (input_scale, input2_scale, output_scale):
725 ofm_scale = 1
726 shift = 0
727 else:
728 ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
729 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
730 else: # Add/Sub
Henrik G Olssonad656a82021-03-19 15:50:28 +0100731 bitdepth = npu_op.ifm.data_type.size_in_bits()
732 use_advanced_scaling = False
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100733 if None in (input_scale, input2_scale, output_scale):
734 opa_scale = opb_scale = ofm_scale = 1
735 opa_shift = shift = 0
736 if npu_op.rescale is not None:
737 ofm_scale, shift = npu_op.rescale
Henrik G Olssonad656a82021-03-19 15:50:28 +0100738 elif input_scale == input2_scale and bitdepth == 16:
739 opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
740 input_scale, input2_scale, output_scale
741 )
742 # align the double rounding with that of advanced scaling
743 opa_scale /= 2
744 opb_scale /= 2
745 shift -= 1
746 opa_shift = 0 # Unused for this case
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100747 elif input_scale == input2_scale:
748 opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
749 input_scale, input2_scale, output_scale
750 )
751 opa_shift = 0 # Unused for this case
Henrik G Olssonad656a82021-03-19 15:50:28 +0100752 # For 8 bit we can't guarantee double rounding with simplified scaling will always be
753 # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
754 # the following we know that double rounding will have no effect for advanced scaling
755 # no matter the input, so we can safely use simplified scaling with double rounding disabled.
756 use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100757 else:
Henrik G Olssonad656a82021-03-19 15:50:28 +0100758 use_advanced_scaling = True
759 if use_advanced_scaling:
760 # Use advanced implementation only when input/output scales differ,
761 # or when we can't guarantee the absence of rounding errors
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100762 (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
763 input_scale, input2_scale, output_scale, bitdepth
764 )
765 opb_scale = 0 # Unused for this case
766 if npu_op.reversed_operands:
767 # If the operand order is reversed we also have to swap which operand is scaled
768 if op_to_scale == scaling.OperandToScale.OPa:
769 op_to_scale = scaling.OperandToScale.OPb
770 else:
771 op_to_scale = scaling.OperandToScale.OPa
772 emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
773 emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
774 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
775 elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
776 output_scale = npu_op.ofm.quantization.scale_f32
777 ofm_scale, shift = scaling.quantise_scale(output_scale)
778 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
779 else:
780 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
781 return op_to_scale
782
783
784# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100785# PRINT
786# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200787
788
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100789def print_feature_map(fm: NpuFeatureMap, name: str):
790 if fm is not None:
791 q = (
792 "no quantization"
793 if fm.quantization is None
794 else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
795 )
796 h, w, c = fm.shape
797 sz = h * w * c * fm.data_type.size_in_bytes()
798 print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
799 strides = get_strides(fm)
800 stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
801 t = fm.tiles
802 addresses = [hex(addr) for addr in t.addresses]
803 print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100804
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100805
Dwight Lidman9b43f842020-12-08 17:56:44 +0100806def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
807 pass_info = f", {cmd}" if cmd else ""
808 if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
809 print(f"{index} {npu_op.op_type.name}{pass_info}")
810 return
811 if isinstance(npu_op, NpuDmaOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100812 print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
813 return
814 k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100815 if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100816 print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200817 else:
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100818 if (
Dwight Lidman9b43f842020-12-08 17:56:44 +0100819 isinstance(npu_op, NpuConv2DOperation)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100820 and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
821 ):
822 fc = "FullyConnected "
823 else:
824 fc = ""
825 print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
826 print_feature_map(npu_op.ifm, "IFM")
827 if npu_op.ifm2_scalar is not None:
828 quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
829 print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
830 else:
831 print_feature_map(npu_op.ifm2, "IFM2")
832 print_feature_map(npu_op.ofm, "OFM")
833 if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
834 print(f" Kernel: {k}")
835 if npu_op.padding is not None:
836 print(f" {npu_op.padding}")
837 for weights in npu_op.weights:
838 print(f" Weights: {weights}")
839 for bias in npu_op.biases:
840 print(f" Scales: {bias}")
841 if npu_op.activation is not None:
842 act = npu_op.activation
843 if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
844 lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
845 print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
Dwight Lidman9b43f842020-12-08 17:56:44 +0100846 if isinstance(npu_op, NpuConv2DOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100847 print(f" {npu_op.block_traversal}")
848 bh, bw, bc = npu_op.block_config
Dwight Lidman9b43f842020-12-08 17:56:44 +0100849 rescale = (
850 f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
851 )
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100852 print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100853
Tim Hall79d07d22020-04-27 18:20:16 +0100854
Dwight Lidman9b43f842020-12-08 17:56:44 +0100855def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
856 npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100857 for index, npu_op in enumerate(npu_op_list):
Dwight Lidman9b43f842020-12-08 17:56:44 +0100858 print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))
Tim Hall79d07d22020-04-27 18:20:16 +0100859
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100860
861# -------------------------------------------------------------------
862# OPERATIONS
863# -------------------------------------------------------------------
864
865
866def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
867 """Generates NPU_OP_* command"""
Dwight Lidman9b43f842020-12-08 17:56:44 +0100868 if isinstance(npu_op, NpuDmaOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100869 emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100870 elif isinstance(npu_op, NpuConv2DOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100871 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100872 elif isinstance(npu_op, NpuConvDepthWiseOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100874 elif isinstance(npu_op, NpuPoolingOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100875 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
Dwight Lidman9b43f842020-12-08 17:56:44 +0100876 elif isinstance(npu_op, NpuElementWiseOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100877 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
878 else:
879 assert 0, "Unsupported operation"
880
881
Louis Verhaard933f55e2020-11-25 14:10:30 +0100882def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100883 """Generates register commands for Conv2D operations"""
884 generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100885
886
Dwight Lidman9b43f842020-12-08 17:56:44 +0100887def generate_conv_depthwise_op(
888 emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
889):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100890 """Generates register commands for depthwise convolution operations"""
891 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100892
893
894def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
895 """Generates register commands for pooling operations"""
896 use_global_scale = (
897 npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
898 )
899 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
900 # Pooling op specific
901 if use_global_scale:
902 generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100903
904
905def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
906 """Generates register commands for elementwise operations"""
907 use_global_scale = npu_op.sub_op_type in (
908 NpuElementWiseOp.ADD,
909 NpuElementWiseOp.SUB,
910 NpuElementWiseOp.MUL,
911 NpuElementWiseOp.LRELU,
912 NpuElementWiseOp.ABS,
913 )
914 op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
915 generate_common(
916 emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
917 )
918 # Elementwise op specific
Louis Verhaard1e170182020-11-26 11:42:04 +0100919 if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100920 # Binary operation; generate IFM2 registers
921 assert npu_op.ifm2 is not None
922 has_scalar = npu_op.ifm2_scalar is not None
923 generate_ifm2(emit, npu_op.ifm2, has_scalar)
924 generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
925 generate_ifm2_broadcast(emit, npu_op)
926 if has_scalar:
927 quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
928 assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
929 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100930
931
932def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
933 """Generates register commands for DMA operations"""
934 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100935 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100936 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
937
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100938 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
939 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100940
941
Louis Verhaard933f55e2020-11-25 14:10:30 +0100942def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
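            # The BLOCKDEP value expresses how independent this operation's blocks are from the
            # previous operation's output (0 = fully dependent); larger values let the hardware
            # overlap the two operations. calc_blockdep derives it from the ops' memory accesses.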
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)
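

# A minimal usage sketch (illustrative, not executed): callers normally reach this module through
# the public-facing API in .api, but the entry point above can be driven directly. Here
# `build_npu_op_list` is a hypothetical helper that produces a List[NpuOperation]:
#
#     from .api import NpuAccelerator
#
#     npu_ops = build_npu_op_list()
#     cmd_stream_words = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)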