blob: f92536915ec3900e7ec2742602f817bfbd19700e [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010061from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010062from .ethos_u55_regs.ethos_u55_regs import acc_format
63from .ethos_u55_regs.ethos_u55_regs import activation
64from .ethos_u55_regs.ethos_u55_regs import cmd0
65from .ethos_u55_regs.ethos_u55_regs import cmd1
66from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020067from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020068from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010069from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010070from .numeric_util import quantise_float32
71from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010073from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010074from .range_set import MemoryAccessSet
Louis Verhaard1e170182020-11-26 11:42:04 +010075from .register_command_stream_util import calc_blockdep
76from .register_command_stream_util import get_dma_memory_accesses
77from .register_command_stream_util import get_op_memory_accesses
78from .register_command_stream_util import get_strides
79from .register_command_stream_util import get_wait_dependency
80from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010081from .register_command_stream_util import to_kernel
82from .register_command_stream_util import UNARY_ELEMWISE_OPS
83from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010084from .shared_buffer_allocation import find_suitable_block_configs
85from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
86from .shared_buffer_allocation import SharedBufferAllocation
Tim Hall79d07d22020-04-27 18:20:16 +010087
88
class RegisterMachine:
    """Tracks the last value written to each register, per bank, so that
    redundant register writes can be suppressed."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records *value* for *reg*; returns True if it differs from the last write."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        return is_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
103
104
class CmdMode(IntEnum):
    """Payload-mode bits of a command stream command word."""

    NoPayload = 0x0000  # cmd0: single command word, 16-bit parameter in the upper half
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of a command code
    CmdOpMask = 0x03FF  # selects the command opcode bits of a command code
110
111
class CommandStreamEmitter:
    """Collects Ethos-U command stream words and tracks the current byte offset."""

    WORD_SIZE = 4  # size of one command stream word, in bytes

    def __init__(self):
        self.cmd_stream = []  # one tuple of 32-bit words per command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset where the next command will be placed

    def get_reg_machine(self, cmd):
        # DMA commands are tracked by a separate register machine
        idx = 1 if "DMA" in cmd.name else 0
        return self.reg_machine[idx]

    def size_in_bytes(self):
        """Returns the total size of the emitted stream, in bytes."""
        return sum(len(words) for words in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Flattens the command stream into a single list of 32-bit words."""
        flat = []
        for words in self.cmd_stream:
            flat.extend(words)
        return flat

    def print_cmds(self):
        """Prints a human readable decoding of the emitted commands."""
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)
            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))
            s = s.ljust(40)
            s += "%5d" % param
            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"
            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a cmd0 command with a 16-bit parameter, unless redundant."""
        value = int(param.value) if isinstance(param, Enum) else int(param)
        value &= 0xFFFF
        command = cmd.value | (value << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, value)):
            # Register content changed; the command must actually be written
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a cmd1 command with a 32-bit payload, unless redundant."""
        offset = int(offset) & 0xFFFFFFFFF  # payload masked to 36 bits
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        # Bits above 32 of the address travel in the 16-bit parameter field
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; never elided by the register machine."""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation-kick command and switches register bank."""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
203
204
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100205# -------------------------------------------------------------------
206# REGISTER GENERATION
207# -------------------------------------------------------------------
208
209
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100210# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit assignments for the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the channel dimension
    ReverseOperandOrder = 1 << 6  # swap the order of the two operands
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
217
218
# Maps NpuPoolingOp to the hardware pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the hardware elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the hardware activation register value
# (TABLE_LOOKUP is handled separately in generate_activation)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps NpuResamplingMode to the hardware resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
266
267
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes *value* using the scale and zero point of *quant* (identity if absent)."""
    if quant is None:
        scale = 1
        zp = 0
    else:
        # A missing scale also falls back to 1, but the zero point is still honoured
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
273
274
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the four IFM_PAD registers"""
    pad_regs = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for reg, value in pad_regs:
        emit.cmd0_with_param(reg, value)
281
282
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION and ACTIVATION_MIN/MAX registers"""
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    # Explicit min/max are quantized; otherwise the OFM data type range is used
    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)
    # Clamp to what the registers can hold (int16) and to the OFM data type range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())

    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
309
310
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers.

    :param emit: command stream emitter receiving the register writes
    :param ptr_cmds: base-address registers to set, paired element-wise with addresses
    :param addresses: tile base addresses
    :param layout: feature map layout; NHCWB16 imposes an alignment restriction
    """
    if layout == NpuLayout.NHCWB16:
        # All base pointers of an NHCWB16 feature map must be 16-byte aligned
        assert all(int(addr) % 16 == 0 for addr in addresses)
    # Generalized from a hard-coded range(4): emit one register write per
    # provided (command, address) pair, so the pairing is explicit and callers
    # are not silently required to pass exactly four of each
    for ptr_cmd, address in zip(ptr_cmds, addresses):
        emit.cmd1_with_address(ptr_cmd, address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100318
319
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold the extent minus one
    extents = (tiles.height_0, tiles.height_1, tiles.width_0)
    for tile_cmd, extent in zip(tile_cmds, extents):
        emit.cmd0_with_param(tile_cmd, extent - 1)
325
326
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    # C: stride between 16-byte channel blocks, Y: between rows, X: between columns
    emit.cmd1_with_address(stride_c_cmd, strides.depth)
    emit.cmd1_with_address(stride_y_cmd, strides.height)
    emit.cmd1_with_address(stride_x_cmd, strides.width)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # bit 0: signedness, bits 2..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
349
350
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    dtype = ofm.data_type
    # bit 0: signedness, bits 1..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
365
366
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant scalar, not a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape = npu_op.ifm.shape
        ifm2_shape = npu_op.ifm2.shape
        dims = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),
        )
        for ifm_dim, ifm2_dim, flag in dims:
            if ifm_dim != ifm2_dim:
                # Broadcasting is only legal from a dimension of size 1
                assert ifm2_dim == 1
                broadcast |= flag
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
394
395
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
411
412
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, base addresses and tiles only apply when IFM2 is a real feature map
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
428
429
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape, each register holds extent minus one
    for shape_cmd, extent in (
        (cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height),
        (cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width),
        (cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth),
    ):
        emit.cmd0_with_param(shape_cmd, extent - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
447
448
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Low bit of each stride
    stride = (stride_x_m1 & 1) | ((stride_y_m1 & 1) << 1)
    # Stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    # Dilation
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
466
467
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Present core without its own weights: reuse core 0's address, zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
486
487
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Present core without its own biases: reuse core 0's address, zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
503
504
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers (arch is unused in the body)"""
    blk = npu_op.block_config
    assert blk is not None, "block_config has not been set"
    # Sanity check: the chosen block configuration must fit in the shared buffer
    alloc = shared_buffer.try_block(Block(blk.width, blk.height, blk.depth))
    assert alloc is not None, f"Block config {blk} does not fit, op: {npu_op.op_type}"
    for blk_cmd, extent in (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, blk.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, blk.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, blk.depth),
    ):
        emit.cmd0_with_param(blk_cmd, extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100519
520
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # Elementwise ops claim all available SHRAM for the input buffers
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # No accumulator buffers are needed, so AB_START is placed at the end of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # IFM2 starts in the latter part of the IB space, split by ifm_count
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
542
543
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    ifm_area = shared_buffer.bank_locations[SharedBufferArea.IFM]
    ifm_ib_end = ifm_area + shared_buffer.banks_required[SharedBufferArea.IFM]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
552
553
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        # Reduce-sum is a special pooling sub-op with its own block type
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)
568
569
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT commands; a negative count means no wait"""
    wait_specs = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in wait_specs:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
577
578
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations.

    The emission order below is kept as-is; registers must be programmed
    before the subsequent operation-kick command is emitted by the caller.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # IFM: region, addresses, tiles, strides, zero point, precision, upscaling
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # OFM: region, addresses, tiles, shape, strides, zero point, precision
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        # Elementwise operations have no kernel; all others must provide one
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block configuration and SHRAM layout depend on the shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100608
609
610# -------------------------------------------------------------------
611# SCALING
612# -------------------------------------------------------------------
613
614
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations.

    Selects one of four scaling strategies depending on the fused activation,
    fused quantize, or an explicit rescale on the operation.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh use a fixed output scale of 1/0x3000
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double the scale until it no longer fits, or the shift is exhausted
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # rescale_bits: number of bits needed to represent the rescale value
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization information available; use identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
667
668
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    Returns the operand to scale (scaling.OperandToScale value, or 0 when no
    operand scaling is needed) so that the caller can pass it on to the common
    register generation.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Scales may legitimately be missing (no quantization on a tensor);
        # each branch below falls back to unit scaling in that case.
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh uses a fixed output scale expected by the LUT-based activation
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization: emit unit OFM scale
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization: default to unit scaling, unless the
                # operation carries an explicit rescale override
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LRELU/ABS only need an OFM scale derived from the output quantization
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining elementwise ops (e.g. min/max/shift): unit OFM scale
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
727
728
729# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100730# PRINT
731# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200732
733
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints a summary of the given feature map (shape, layout, strides, tiles); no-op for None."""
    if fm is None:
        return
    if fm.quantization is None:
        quant_info = "no quantization"
    else:
        quant_info = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    total_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f"    {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, "
        f"{fm.data_type}, size={total_bytes}, {quant_info}"
    )
    fm_strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {fm_strides.height}/{fm_strides.width}/{fm_strides.depth}"
    tiles = fm.tiles
    base_addresses = [hex(addr) for addr in tiles.addresses]
    print(
        f"         {stride_str}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, "
        f"h1={tiles.height_1}, base={base_addresses}"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100749
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100750
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of one NPU operation.

    :param npu_op: operation to print
    :param index: position of the operation in the command list, printed as prefix
    :param cmd: optional extra pass/command info appended to the header line
    """
    pass_info = f", {cmd}" if cmd else ""
    # Plain NpuOperation subclasses that are neither DMA nor block operations
    # only get a one-line header
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    # From here on npu_op is a block operation
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 stride-1 dilation-1 convolution is reported as fully connected
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar IFM2 is printed with its quantized value instead of a feature map dump
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    # Elementwise ops have no real kernel, so skip the kernel line for them
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # Only print activation info when it is not a plain (unclamped) RELU/no-op
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Pooling/elementwise operations may carry a rescale; other block ops do not
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100798
Tim Hall79d07d22020-04-27 18:20:16 +0100799
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, passing along any mapped command info."""
    op_to_cmd = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, op_to_cmd.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100804
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100805
806# -------------------------------------------------------------------
807# OPERATIONS
808# -------------------------------------------------------------------
809
810
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the final NPU_OP_* command that starts execution of the given operation."""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start encodes channel (upper bits) and mode (lower bits) in one parameter
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        # Pooling sub-type (avg/max/reduce-sum) selects the command parameter
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
825
826
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations; the block traversal comes from the operation itself"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100830
831
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations; always uses depth-first traversal"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100837
838
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations."""
    # A global OFM scale is only used for average/reduce-sum pooling without padding
    is_avg_or_reduce = npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    global_scale = is_avg_or_reduce and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=global_scale)
    if global_scale:
        # Pooling specific: emit the OFM scale register
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100848
849
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    # Only these sub-ops drive the OFM via the global scale register
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    # Scale registers must be emitted first; op_to_scale feeds into the common generation
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            # Quantized scalar must fit in IFM2's data type range
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100875
876
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations: source/destination region + address, and length"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    # Transfer length is taken from the source address range
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100885
886
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    """
    # Block operations all share the (emit, op, arch) generator signature
    block_op_generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
    )
    for op_class, generator in block_op_generators:
        if isinstance(npu_op, op_class):
            generator(emit, npu_op, arch)
            return
    # DMA takes no architecture argument
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100904
905
def generate_command_stream(
    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None, npu_op_to_cmd=None
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to encode, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated commands
    :param add_to_debug_db: optional callback(npu_op, offset) invoked after each op is emitted
    :param npu_op_to_cmd: optional mapping from operation to pass info, used for verbose printing
    :raises VelaError: if the generated stream exceeds the 16 MiB hardware limit
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"
    if arch.is_ethos_u65_system:
        # Multi-core systems need the parallel mode register set up front
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        # Work out which earlier commands this op must wait for, based on memory overlap
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP: block-level dependency on the previous block operation
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Waits must be emitted before the NPU_OP command they guard
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # Hardware limit: command stream size field is 24 bits => max 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
960
961
962# -------------------------------------------------------------------
963# EXTERNAL API
964# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100965
966
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.

    Operations that are not block operations (e.g. DMA) have no block configs,
    giving an empty list.
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    candidates = find_suitable_block_configs(arch, shared_buffer)
    # NOTE(review): candidate entries are indexed [0]=height, [1]=width, [3]=depth;
    # index 2 is intentionally unused here
    return [NpuShape3D(height=cfg[0], width=cfg[1], depth=cfg[3]) for cfg in candidates]
Louis Verhaard933f55e2020-11-25 14:10:30 +0100977
978
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    return generate_command_stream(npu_op_list, arch, verbose=False)