blob: 3b552e0903a844707da6fdae03beb10daa63fe8c [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010024from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010025from typing import List
26from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010027
28import numpy as np
29
30from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010031from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010032from .api import NpuActivation
33from .api import NpuActivationOp
34from .api import NpuAddressRange
35from .api import NpuBlockOperation
36from .api import NpuBlockTraversal
37from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010038from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010039from .api import NpuDataType
40from .api import NpuDmaOperation
41from .api import NpuElementWiseOp
42from .api import NpuElementWiseOperation
43from .api import NpuFeatureMap
44from .api import NpuKernel
45from .api import NpuLayout
46from .api import NpuOperation
47from .api import NpuOperationType
48from .api import NpuPadding
49from .api import NpuPoolingOp
50from .api import NpuPoolingOperation
51from .api import NpuQuantization
52from .api import NpuResamplingMode
53from .api import NpuRoundingMode
54from .api import NpuShape3D
55from .api import NpuTileBox
56from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010057from .architecture_features import ArchitectureFeatures
58from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010059from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import SharedBufferArea
61from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010062from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010063from .ethos_u55_regs.ethos_u55_regs import acc_format
64from .ethos_u55_regs.ethos_u55_regs import activation
65from .ethos_u55_regs.ethos_u55_regs import cmd0
66from .ethos_u55_regs.ethos_u55_regs import cmd1
67from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020068from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020069from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010070from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import quantise_float32
72from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010073from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010074from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010075from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010076from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010077from .register_command_stream_util import calc_blockdep
78from .register_command_stream_util import get_dma_memory_accesses
79from .register_command_stream_util import get_op_memory_accesses
80from .register_command_stream_util import get_strides
81from .register_command_stream_util import get_wait_dependency
82from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010083from .register_command_stream_util import to_kernel
84from .register_command_stream_util import UNARY_ELEMWISE_OPS
85from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010086from .shared_buffer_allocation import find_suitable_block_configs
87from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
88from .shared_buffer_allocation import SharedBufferAllocation
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to every register so that writes which
    would not change a register can be elided from the command stream.
    Supports multiple banks; currently a single bank is used."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Record *value* for *reg* in the active bank.

        Returns True if the value differs from the previously recorded one
        (i.e. the write must actually be emitted)."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # NOTE: return True unconditionally here to force emission of every command
        return is_changed

    def switch_bank(self):
        """Make the next bank active (wraps around after the last bank)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
105
106
class CmdMode(IntEnum):
    """Bit-fields of the 16-bit command code word of the command stream."""

    NoPayload = 0x0000  # cmd0: the command is a single 32-bit word
    Payload32 = 0x4000  # cmd1: the command is followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of the code word
    CmdOpMask = 0x03FF  # selects the command opcode bits of the code word
112
113
class CommandStreamEmitter:
    """Accumulates an Ethos-U command stream as a list of 32-bit words.

    Uses one RegisterMachine for core registers and one for DMA registers to
    suppress register writes that would not change the register's value.
    """

    # Size in bytes of one command stream word
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []  # list of tuples; each tuple holds the words of one command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]  # [core, DMA]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset where the next command will be placed

    def get_reg_machine(self, cmd):
        """Returns the RegisterMachine tracking the given command's register."""
        # DMA registers are double-buffered in hardware and tracked separately
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size of the emitted command stream in bytes."""
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        """Flattens the command stream into a single list of 32-bit words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a human readable disassembly of the emitted command stream."""
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word cmd0 command with a 16-bit parameter.

        The command is skipped if the targeted register already holds the value.
        """
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word cmd1 command: code word plus a 32-bit payload.

        The command is skipped if the targeted register already holds the value.
        """
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a cmd1 command carrying an address; bits above 32 go into the
        16-bit parameter field of the code word."""
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a KERNEL_WAIT/DMA_WAIT command; never elided."""
        # Parameter encodes the channel in the upper nibble and the count below
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an NPU_OP_* command (never elided) and switches register bank."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
206
207
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100208# -------------------------------------------------------------------
209# REGISTER GENERATION
210# -------------------------------------------------------------------
211
212
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100213# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags for the IFM2_BROADCAST register (elementwise operations)."""

    BroadcastHdim = 1 << 0  # IFM2 height is 1 and is broadcast over H
    BroadcastWdim = 1 << 1  # IFM2 width is 1 and is broadcast over W
    BroadcastCdim = 1 << 2  # IFM2 depth is 1 and is broadcast over C
    ReverseOperandOrder = 1 << 6  # swap IFM and IFM2 as operands
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
220
221
# Maps a pooling op to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an elementwise op to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an activation op to the corresponding activation register value
# (TABLE_LOOKUP is handled separately in generate_activation)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps the external-API resampling mode to the register resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
269
270
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits.

    :param memory_accesses: all memory accesses performed by the operation
    :param mem_limits: maps region index to the highest allowed address offset
    :raises VelaError: if a region is unknown or an access falls outside [0, limit]
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Renamed from 'max' to avoid shadowing the builtin
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                # Both the start and the (inclusive) end of each range must be in bounds
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
284
285
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given floating point value using the given quantization parameters.

    Missing quantization (or a missing scale) is treated as scale 1, zero point 0.
    """
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
291
292
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the IFM_PAD_TOP/LEFT/BOTTOM/RIGHT registers"""
    pad_regs = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, pad_value in pad_regs:
        emit.cmd0_with_param(pad_cmd, pad_value)
299
300
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # A missing activation is treated as a pass-through (NONE_OR_RELU, no clipping)
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # Missing min/max clip values default to the full range of the OFM data type
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    # Clamp to both the int16 register range and the OFM data type range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        # Activation values 16..23 select lookup table 0..7
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            # I8 range also narrows the clip boundaries accordingly
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
327
328
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates the four xFM_BASE tile address registers"""
    if layout == NpuLayout.NHCWB16:
        # NHCWB16 requires every tile base address to be 16-byte aligned
        assert all((int(addr) % 16) == 0 for addr in addresses)
    # ptr_cmds and addresses always hold one entry per tile (4 of each)
    for ptr_cmd, address in zip(ptr_cmds, addresses):
        emit.cmd1_with_address(ptr_cmd, address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100336
337
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold the extent minus one
    for tile_cmd, extent in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, extent - 1)
343
344
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    fm_strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, fm_strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, fm_strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, fm_strides.width)  # stride between horizontal values (W)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100353
354
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # bit 0: signedness; bits 2-3: precision encoding for 8/16/32 bit types
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)
    # bit 6: set for NHCWB16 ("brick") layout
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    # bits 8+: operand scaling selection
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
367
368
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    # bit 0: signedness; bits 1-2: precision encoding for 8/16/32 bit types
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    # bit 6: set for NHCWB16 ("brick") layout
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    # bits 14-15: rounding mode
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
383
384
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        shape = npu_op.ifm.shape
        shape2 = npu_op.ifm2.shape
        # Any dimension where IFM and IFM2 differ must be 1 in IFM2,
        # and IFM2 is then broadcast along it
        for dim_name, broadcast_flag in (
            ("height", IFM2Broadcast.BroadcastHdim),
            ("width", IFM2Broadcast.BroadcastWdim),
            ("depth", IFM2Broadcast.BroadcastCdim),
        ):
            if getattr(shape, dim_name) != getattr(shape2, dim_name):
                assert getattr(shape2, dim_name) == 1
                ifm2_broadcast |= broadcast_flag
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
412
413
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    # Only depth is needed for the IFM; height/width come from the OFM and kernel
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
429
430
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # A scalar IFM2 lives in the register file, so region/addresses/tiles
        # are only emitted for a real feature map
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
446
447
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # The OFM needs its full 3D shape (unlike the IFM, which only needs depth)
    for shape_cmd, extent in (
        (cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height),
        (cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width),
        (cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth),
    ):
        emit.cmd0_with_param(shape_cmd, extent - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
465
466
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Kernel size registers hold the dilated extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # KERNEL_STRIDE is a bit-field. The original code relied on '-' binding
    # tighter than '&' and '>>'; the parentheses below make the (unchanged)
    # evaluation order explicit.
    # bits 0-1: low bit of (stride - 1) for x and y
    stride = (kernel.stride_x - 1) & 1
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # bits 6.. and 9..: extension (high) bits of (stride - 1) for x and y
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    # bits 3-4: (dilation - 1) for x and y
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    # bit 2: weight ordering (part-kernel-first traversal)
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
484
485
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < len(weights):
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core present but without its own weights: point it at core 0's
            # weights with zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
504
505
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set weights sources for active and present cores
    core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < len(biases):
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core present but without its own biases: point it at core 0's
            # biases with zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
521
522
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    blk = npu_op.block_config
    assert blk is not None, "block_config has not been set"
    # Verify that the chosen block configuration actually fits the shared buffer
    alloc = shared_buffer.try_block(Block(blk.width, blk.height, blk.depth))
    assert alloc is not None, f"Block config {blk} does not fit, op: {npu_op.op_type}"
    for blk_cmd, blk_extent in (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, blk.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, blk.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, blk.depth),
    ):
        emit.cmd0_with_param(blk_cmd, blk_extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100537
538
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    # (a LUT, if used, occupies banks at the end and reduces what is available)
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        # (the IFM banks are split evenly between IFM and IFM2)
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
560
561
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    # The IFM input buffer ends where its allocated banks end
    ifm_ib_end = (
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM]
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    # Accumulators start at their allocated bank location
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
570
571
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation.

    :raises VelaError: if the operation type is not supported
    """
    # Note: depthwise must be tested before Conv2D if it subclasses it; keep this order
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        # Raise a real exception instead of 'assert 0' so the check survives 'python -O'
        raise VelaError(f"Unsupported operation: {type(npu_op).__name__}")
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)
586
587
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT commands"""
    # A negative watermark value means no wait is required on that channel
    for wait_cmd, outstanding in (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    ):
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
595
596
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # IFM: region, addresses, tiles, strides, zero point, precision, upscaling
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # OFM: region, addresses, tiles, shape, strides, zero point, precision
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations have no kernel
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Allocate SHRAM and emit block-config and SHRAM layout registers
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100626
627
628# -------------------------------------------------------------------
629# SCALING
630# -------------------------------------------------------------------
631
632
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh require a fixed output scale of 1/0x3000
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                # Maximise the scale without overflowing half the int16 range
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # rescale_bits: number of bits needed to represent the rescale factor
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # Missing quantization: pass values through unscaled
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
694
695
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale

    NOTE: may also switch npu_op.rounding_mode to NATURAL for 8-bit ADD/SUB
    when simplified scaling is proven safe (see comment in the 8-bit branch).
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Any of these may be None when quantization info is absent
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh uses a fixed output scale of 1/0x3000
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization info: fall back to identity scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization info: identity scaling, unless the op
                # carries an explicit rescale for the OFM
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale and bitdepth == 16:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale /= 2
                opb_scale /= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
                if not use_advanced_scaling:
                    npu_op.rounding_mode = NpuRoundingMode.NATURAL
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # All other elementwise ops use unit scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
774
775
776# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100777# PRINT
778# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200779
780
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints debug info for one feature map; does nothing when fm is None"""
    if fm is None:
        return
    quant = fm.quantization
    q = "no quantization" if quant is None else f"scale: {quant.scale_f32}, zero: {quant.zero_point}"
    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = [hex(addr) for addr in t.addresses]
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100796
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100797
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human-readable summary of one NPU operation: op type, feature
    maps, kernel, weights/scales, activation and block config. ``cmd`` is
    optional pass information appended to the header line.
    """
    pass_info = f", {cmd}" if cmd else ""
    # Non-DMA, non-block operations only get a one-line summary
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            # 1x1 kernel with unit stride/dilation: label as fully connected
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar IFM2: show both raw and quantized values
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # Only print when the activation actually does something (not a bare NONE_OR_RELU)
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100845
Tim Hall79d07d22020-04-27 18:20:16 +0100846
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints a summary of every operation in the list, in order"""
    cmd_lookup = dict() if npu_op_to_cmd is None else npu_op_to_cmd
    for idx, op in enumerate(npu_op_list):
        print_operation(op, idx, cmd_lookup.get(op))
Tim Hall79d07d22020-04-27 18:20:16 +0100851
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100852
853# -------------------------------------------------------------------
854# OPERATIONS
855# -------------------------------------------------------------------
856
857
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    # Dispatch on operation type; the isinstance order is significant and must
    # be kept (subclass relationships between the operation classes)
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start parameter packs channel and mode into a single word
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
872
873
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    # Only the common registers are needed; block traversal comes from the op itself
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100877
878
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    # Depthwise convolutions always use depth-first block traversal
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100884
885
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    # Global OFM scaling applies only to average/reduce-sum pooling without padding
    is_avg_or_sum = npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = is_avg_or_sum and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    if use_global_scale:
        # Pooling-specific: emit the OFM_SCALE register
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100895
896
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    scaled_sub_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in scaled_sub_ops
    # Scaling registers must be emitted before the common registers
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        scalar_present = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, scalar_present)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if scalar_present:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            # The quantized scalar must fit in the IFM2 data type
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100922
923
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    src = dma_op.src
    dest = dma_op.dest
    # Emission order is preserved: src region/address, dest region/address, length
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100932
933
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates all register commands for the given operation, except the final
    NPU_OP_* command (which is emitted by generate_operation_code).
    """
    # isinstance order matters and is kept as-is (subclass relationships)
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100951
952
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: high-level NPU operations to encode
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints operations and the generated commands
    :param mem_limits: maximum allowed address offset per region index
    :param add_to_debug_db: optional callback(npu_op, offset) recording debug info
    :param npu_op_to_cmd: optional mapping from NPU operation to command, for verbose output
    :raises VelaError: on memory-limit violations or if the stream exceeds 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Ethos-U65: set the number of cores used for parallel execution
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None  # previous block operation, used for BLOCKDEP calculation
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Waits must precede the NPU_OP command they guard
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB: hardware limit on total command stream size
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
1018
1019
1020# -------------------------------------------------------------------
1021# EXTERNAL API
1022# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001023
1024
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.

    Returns all suitable block configs for a block operation; an empty list
    for any other operation type (e.g. DMA).
    """
    if isinstance(npu_op, NpuBlockOperation):
        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
        shared_buffer = create_shared_buffer(npu_op, arch)
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # NOTE(review): indices assume each block is laid out as
        # [height, width, <unused>, depth] — index 2 is deliberately skipped;
        # confirm against find_suitable_block_configs' return format
        return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]
    return []
Louis Verhaard933f55e2020-11-25 14:10:30 +01001035
1036
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # Every region gets the architecture's maximum address offset, except the
    # memory-to-memory base pointer, which is bounded by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)