# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import try_block_config
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import create_default_arch
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import shape3d_to_block
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
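
    # Note (illustrative): set_register() returns True only when the cached value for a
    # register changes, which lets CommandStreamEmitter below skip emitting redundant
    # register writes; e.g. two identical NPU_SET_IFM_REGION writes in a row produce a
    # single command word.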


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


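# Illustrative note on the command word layout used by CommandStreamEmitter below: a cmd0
# command is a single 32-bit word, with the opcode in the low 16 bits and the parameter in
# the high 16 bits; a cmd1 command is the same word with the Payload32 bit set, followed by
# one 32-bit payload word. For example, cmd0_with_param(cmd0.NPU_SET_IFM_REGION, 1) appends
# the single word (1 << 16) | cmd0.NPU_SET_IFM_REGION.value.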
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
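# For example (assuming quantise_float32 rounds value / scale and then adds the zero point),
# quantise(1.0, NpuQuantization(scale_f32=0.5, zero_point=10)) would give 12.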


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
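# Worked example (derived from the bit packing above): a signed 8-bit feature map in NHWC
# layout with op_to_scale == 0 gives prec == 1; the same feature map in NHCWB16 layout
# gives 1 | (1 << 6) == 0x41.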


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
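# Example (derived from the checks above): an IFM of shape HxWxC combined with an IFM2 of
# shape 1x1xC (H, W > 1) sets BroadcastHdim and BroadcastWdim but not BroadcastCdim.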


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
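# Worked example (derived from the encoding above): a 2x2 stride with 1x1 dilation and
# DEPTH_FIRST traversal packs to 0b11 (x and y stride low bits set, extension and dilation
# bits clear); PART_KERNEL_FIRST traversal would additionally set bit 2.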


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])


def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """


def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates the ArchitectureBlockConfig for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
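# Note: NPU_SET_OFM_SCALE carries the quantised multiplier in the 32-bit payload and the
# shift amount in the 16-bit parameter field (see cmd1_with_offset); the scale/shift pairs
# computed above are intended to approximate multiplication by scale * 2**-shift.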


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale and bitdepth == 16:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale /= 2
                opb_scale /= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    pass_info = f", {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    if isinstance(npu_op, NpuDmaOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if isinstance(npu_op, NpuBlockOperation):
        # TODO: implement this function
        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
        block = arch.ofm_ublock
        return [NpuShape3D(height=block.height, width=block.width, depth=block.depth)]
    return []


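# Illustrative usage (hypothetical operation list; in practice this entry point is reached
# via vela's public API):
#
#   ops: List[NpuOperation] = build_npu_ops()  # hypothetical helper producing NPU operations
#   cmds = generate_register_command_stream(ops, NpuAccelerator.Ethos_U55_128)
#   # cmds is a list of 32-bit words ready to be placed in the command stream buffer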
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)