blob: a4466c921771d09665f32a449e644164b5d41c14 [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010061from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010062from .ethos_u55_regs.ethos_u55_regs import acc_format
63from .ethos_u55_regs.ethos_u55_regs import activation
64from .ethos_u55_regs.ethos_u55_regs import cmd0
65from .ethos_u55_regs.ethos_u55_regs import cmd1
66from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020067from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020068from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010069from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010070from .numeric_util import quantise_float32
71from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010073from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010074from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010075from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010076from .register_command_stream_util import calc_blockdep
77from .register_command_stream_util import get_dma_memory_accesses
78from .register_command_stream_util import get_op_memory_accesses
79from .register_command_stream_util import get_strides
80from .register_command_stream_util import get_wait_dependency
81from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010082from .register_command_stream_util import to_kernel
83from .register_command_stream_util import UNARY_ELEMWISE_OPS
84from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010085from .shared_buffer_allocation import find_suitable_block_configs
86from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
87from .shared_buffer_allocation import SharedBufferAllocation
Louis Verhaard024c3552021-03-17 14:26:34 +010088from ethosu.vela.errors import VelaError
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to each register so redundant writes can be elided."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records a register write; returns True if the value differs from the last write."""
        bank = self.registers[self.bank_idx]
        has_changed = bank[reg] != value
        bank[reg] = value
        return has_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
105
106
class CmdMode(IntEnum):
    """Payload-mode bits encoded in a command word's code field."""

    NoPayload = 0x0000  # cmd0: single command word, no payload
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # mask to extract the payload mode from a command code
    CmdOpMask = 0x03FF  # mask to extract the command opcode from a command code
112
113
class CommandStreamEmitter:
    """Builds up a register command stream as a list of 32-bit words, eliding redundant writes."""

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []  # list of tuples; each tuple holds the word(s) of one command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]  # [0]: NPU commands, [1]: DMA commands
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset at which the next command will be emitted

    def get_reg_machine(self, cmd):
        # DMA registers are tracked separately from the other registers so that
        # interleaved DMA/NPU commands do not defeat redundant-write elimination
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size of the emitted command stream, in bytes."""
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        """Flattens the command stream into a list of 32-bit words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints the emitted commands in human readable form, for debugging."""
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word cmd0 command with a 16-bit parameter, unless it is redundant."""
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word cmd1 command (command word + 32-bit payload), unless it is redundant."""
        # Bug fix: the payload is one 32-bit word, so mask with 0xFFFFFFFF. The previous
        # mask 0xFFFFFFFFF kept 36 bits, which could leak address bits >= 32 into the
        # payload; those bits belong in the param field (see cmd1_with_address).
        offset = int(offset) & 0xFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a cmd1 command for an address; bits 32 and up go into the param field."""
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command for the given channel/outstanding count (no redundancy elision)."""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command and switches register bank for the next operation."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
205
206
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100207# -------------------------------------------------------------------
208# REGISTER GENERATION
209# -------------------------------------------------------------------
210
211
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100212# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags written to the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # swap IFM and IFM2 operand order
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant rather than a feature map
219
220
# Maps a pooling op to the corresponding pooling_mode value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an elementwise op to the corresponding elementwise_mode value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an activation op to the corresponding activation function value
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an NpuResamplingMode to the corresponding resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
268
269
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    :param memory_accesses: set of memory accesses performed by the operation
    :param mem_limits: maps region index to the highest allowed address offset in that region
    :raises VelaError: if an access uses an unknown region or falls outside [0, limit]
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # renamed from "max" to avoid shadowing the builtin
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                # both endpoints of every range must lie within the region
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
283
284
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value using the given quantization parameters (scale 1 / zero point 0 if absent)"""
    if quant is None or quant.scale_f32 is None:
        scale = 1
    else:
        scale = quant.scale_f32
    zero_point = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zero_point)
290
291
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_cmds = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, pad_value in pad_cmds:
        emit.cmd0_with_param(pad_cmd, pad_value)
298
299
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    dtype_min = ofm.data_type.min_value()
    dtype_max = ofm.data_type.max_value()
    quantized_min = dtype_min if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = dtype_max if act.max is None else quantise(act.max, ofm.quantization)
    # Clamp to both the int16 range and the OFM data type's own range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, dtype_min)
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, dtype_max)
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
326
327
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # NHCWB16 requires every tile base address to be 16-byte aligned
        assert all(int(addr) % 16 == 0 for addr in addresses)
    for index in range(4):
        emit.cmd1_with_address(ptr_cmds[index], addresses[index])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold the extent minus one
    for tile_cmd, extent in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, extent - 1)
342
343
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    stride_cmds = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride_value in stride_cmds:
        emit.cmd1_with_address(stride_cmd, stride_value)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100352
353
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # bit 0: signedness, bits 2..: precision derived from the data type size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6  # brick (NHCWB16) format flag

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
366
367
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    dtype = ofm.data_type
    # bit 0: signedness, bits 1..: precision derived from the data type size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)

    if use_global_scale:
        prec |= 1 << 8  # use the global OFM scale, as opposed to per channel scale
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6  # brick (NHCWB16) format flag
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
382
383
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant: set UseIFM2Scalar bit instead of reading a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        shape = npu_op.ifm.shape
        shape2 = npu_op.ifm2.shape
        dim_flags = (
            (shape.height, shape2.height, IFM2Broadcast.BroadcastHdim),
            (shape.width, shape2.width, IFM2Broadcast.BroadcastWdim),
            (shape.depth, shape2.depth, IFM2Broadcast.BroadcastCdim),
        )
        for ifm_dim, ifm2_dim, flag in dim_flags:
            if ifm_dim != ifm2_dim:
                # Broadcasting is only possible when the IFM2 dimension is 1
                assert ifm2_dim == 1
                broadcast |= flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
411
412
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
428
429
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, base addresses and tile geometry only apply when IFM2 is a feature map
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
445
446
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape, each register holds the extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
464
465
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # stride register bit layout:
    #   bit 0: x stride low bit, bit 1: y stride low bit, bit 2: part-kernel-first traversal,
    #   bits 3/4: x/y dilation, bits 6..: x stride extension, bits 9..: y stride extension
    stride = stride_x_m1 & 1
    stride |= (stride_y_m1 & 1) << 1
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
483
484
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core has no weights of its own: point it at the first source with zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
503
504
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core has no biases of its own: point it at the first source with zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
520
521
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    # Verify the chosen block fits in the shared buffer allocation before programming it
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    blk_cmds = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, extent in blk_cmds:
        emit.cmd0_with_param(blk_cmd, extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100536
537
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
559
560
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    # IFM input buffer ends after the banks allocated for the IFM
    ifm_ib_end = (
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM]
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
569
570
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    # Map the operation's concrete type to the corresponding block type
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        is_reduce_sum = npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM
        block_type = NpuBlockType.ReduceSum if is_reduce_sum else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, resampling_mode_map[npu_op.ifm_upscale])
585
586
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT; a negative outstanding count means no wait is emitted"""
    wait_cmds = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding_count in wait_cmds:
        if outstanding_count >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding_count)
594
595
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations

    Emits IFM/OFM geometry and precision, padding, kernel, weight/bias sources,
    activation, block config and SHRAM layout registers for the given operation.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations have no kernel; all other block operations require one
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block config and SHRAM registers are both derived from one shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100625
626
627# -------------------------------------------------------------------
628# SCALING
629# -------------------------------------------------------------------
630
631
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations

    Chooses scale and shift depending on how the pooling op is fused:
    sigmoid/tanh activation, fused quantize, explicit rescale, or plain pooling.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh use a fixed output scale based on 0x3000 (see the INT16 comment below)
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double the scale until it approaches the int16 limit (or the shift limit of 30)
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # Number of bits needed to represent the rescale factor, plus one
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization info available: identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
684
685
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    # 0 means "no operand needs extra scaling"; only the advanced add/sub
    # path below selects a specific operand (scaling.OperandToScale)
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Any of these may be None if the corresponding tensor has no quantization
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh requires a fixed output scale
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization info; fall back to identity scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Identity scaling, unless the operation carries an explicit rescale
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LRELU/ABS scale purely by the output quantization
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining elementwise ops (e.g. MIN/MAX/SHL/SHR) use identity scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
744
745
746# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100747# PRINT
748# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200749
750
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints shape, quantization, stride and tile info for one feature map; no-op when fm is None."""
    if fm is None:
        return
    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = [hex(addr) for addr in t.addresses]
    print(f"         {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100766
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100767
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a single NPU operation (optionally with its originating command) for debugging."""
    suffix = f", {cmd}" if cmd else ""
    # Simple operations (neither DMA nor block operations) print one line only
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{suffix}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{suffix}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{suffix}")
    else:
        # A 1x1 convolution with unit stride/dilation is flagged as fully connected
        fc = ""
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        print(f"{index} {fc}{npu_op.op_type.name}{suffix}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    act = npu_op.activation
    if act is not None and (act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None):
        lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
        print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100815
Tim Hall79d07d22020-04-27 18:20:16 +0100816
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, looking up its originating command if a mapping is given."""
    cmd_lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, cmd_lookup.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100821
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100822
823# -------------------------------------------------------------------
824# OPERATIONS
825# -------------------------------------------------------------------
826
827
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates the final NPU_OP_* command that starts the given operation"""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start encodes channel and mode in the parameter field
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"
842
843
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    # Conv2D carries its own block traversal order (part-kernel/depth first)
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100847
848
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    # Depthwise convolutions always use depth-first block traversal
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100854
855
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    # Global OFM scaling applies only to average/reduce-sum pooling without padding
    globally_scaled_ops = (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100865
866
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    globally_scaled = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled
    # Scaling registers must be emitted before the common registers
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        # The quantized scalar must fit the IFM2 data type's value range
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100892
893
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    src = dma_op.src
    dest = dma_op.dest
    # Emit order is kept: source region/address, destination region/address, then length
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100902
903
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    # Block operations: dispatch on the concrete operation class
    for op_class, generator in (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
    ):
        if isinstance(npu_op, op_class):
            generator(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100921
922
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: high level NPU operations, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the emitted commands
    :param mem_limits: maximum address offset per memory region, used for validation
    :param add_to_debug_db: optional callback(npu_op, offset) recording the stream
        offset of each operation
    :param npu_op_to_cmd: optional mapping from NPU operation to originating
        high-level command, used only for verbose printing
    :raises VelaError: on memory-limit violations or if the stream exceeds 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Ethos-U65: configure the number of parallel cores
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None  # previous block operation, used for BLOCKDEP calculation
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            # Advance the dependency watermark and compute which earlier
            # commands this operation must wait for
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Wait commands must precede the NPU_OP command they guard
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # Hardware limitation: the command stream size field is 24 bits
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
988
989
990# -------------------------------------------------------------------
991# EXTERNAL API
992# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100993
994
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    Returns an empty list for operations that are not block operations.
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    return [
        NpuShape3D(height=block[0], width=block[1], depth=block[3])
        for block in find_suitable_block_configs(arch, shared_buffer)
    ]
Louis Verhaard933f55e2020-11-25 14:10:30 +01001005
1006
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # All 8 regions are limited by the architecture's maximum address offset,
    # except the mem2mem base pointer which is limited by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)