# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations where needed, and
# generates a bit stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import is_dma_op
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
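

# Illustrative sketch (not part of the generator): RegisterMachine caches the last
# value written to each register, so redundant writes can be dropped from the stream.
#
#   rm = RegisterMachine()
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # -> True: first write, emit it
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # -> False: unchanged, skip it
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 2)  # -> True: value changed, emit it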


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF
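

# Worked example of the command word layout implied by CmdMode (see cmd0_with_param
# and print_cmds below): bits 0..9 hold the opcode, bits 14..15 the payload mode and
# bits 16..31 the 16-bit parameter; a Payload32 command is followed by one 32-bit
# payload word. Emitting NPU_SET_IFM_REGION with param=2, for instance, produces the
# single word (2 << 16) | cmd0.NPU_SET_IFM_REGION.value.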


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF  # mask to the 32-bit payload word
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
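

# Minimal usage sketch of the emitter (illustrative only):
#
#   emit = CommandStreamEmitter()
#   emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, 1)     # emits one 32-bit word
#   emit.cmd1_with_offset(cmd1.NPU_SET_IFM_BASE0, 0x40)  # emits two 32-bit words
#   emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
#   assert emit.size_in_bytes() == len(emit.to_list()) * CommandStreamEmitter.WORD_SIZE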


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7
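

# Worked example (illustrative): for a binary elementwise operation with IFM shape
# 8x8x16 and IFM2 shape 1x8x16, only the height differs, so generate_ifm2_broadcast
# below sets IFM2_BROADCAST to BroadcastHdim (0x1); a scalar IFM2 sets UseIFM2Scalar
# (0x80) instead.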


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
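

# Example (illustrative, assuming quantise_float32 computes
# zero_point + round(value / scale)):
#
#   quantise(6.0, NpuQuantization(scale_f32=0.5, zero_point=3))  # -> 15
#   quantise(6.0, None)                                          # -> 6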


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
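

# Worked example (illustrative): a signed INT8 feature map in NHCWB16 layout with
# no operand scaling gives prec = 1 (signed) | (0 << 2) (8-bit) | (1 << 6) (NHCWB16),
# i.e. 0x41.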


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
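

# Worked example (illustrative): stride_x = stride_y = 2 with no dilation and
# DEPTH_FIRST traversal gives stride = ((2 - 1) & 1) | (((2 - 1) & 1) << 1) = 0b11;
# PART_KERNEL_FIRST would additionally set bit 2, giving 0b111.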


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    op_type = npu_op.op_type
    block_type = NpuBlockType.Default
    if op_type == NpuOperationType.Conv2D:
        block_type = NpuBlockType.ConvolutionMxN
    elif op_type == NpuOperationType.ConvDepthWise:
        block_type = NpuBlockType.ConvolutionDepthWise
    elif op_type == NpuOperationType.Pooling:
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif op_type == NpuOperationType.ElementWise:
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)
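

# Semantics sketch (an assumption based on how get_wait_dependency is used): a
# KERNEL_WAIT/DMA_WAIT with outstanding count n stalls the command stream until at
# most n previously issued kernel/DMA operations are still in flight, so n = 0
# means "wait for everything issued so far on that channel".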


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if npu_op.op_type == NpuOperationType.ElementWise:
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # If an average pool is fused with a concat or other memory operation, rescaling might be needed;
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits required for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
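

# The (scale, shift) pair emitted above encodes a fixed-point multiplier: the
# effective scale is approximately scale * 2**-shift (an assumption based on the
# scaling helpers used here). A desired rescale of 0.75, for example, could be
# encoded as scale=24576, shift=15, since 24576 / 2**15 == 0.75.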


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"        {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
826 """Generates register commands for depthwise convolution operations"""
827 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100828
829
830def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
831 """Generates register commands for pooling operations"""
832 use_global_scale = (
833 npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
834 )
835 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
836 # Pooling op specific
837 if use_global_scale:
838 generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100839
840
841def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
842 """Generates register commands for elementwise operations"""
843 use_global_scale = npu_op.sub_op_type in (
844 NpuElementWiseOp.ADD,
845 NpuElementWiseOp.SUB,
846 NpuElementWiseOp.MUL,
847 NpuElementWiseOp.LRELU,
848 NpuElementWiseOp.ABS,
849 )
850 op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
851 generate_common(
852 emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
853 )
854 # Elementwise op specific
Louis Verhaard1e170182020-11-26 11:42:04 +0100855 if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100856 # Binary operation; generate IFM2 registers
857 assert npu_op.ifm2 is not None
858 has_scalar = npu_op.ifm2_scalar is not None
859 generate_ifm2(emit, npu_op.ifm2, has_scalar)
860 generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
861 generate_ifm2_broadcast(emit, npu_op)
862 if has_scalar:
863 quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
864 assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
865 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100866
867
868def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
869 """Generates register commands for DMA operations"""
870 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
871 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
872 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
873
874 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
875 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
876
877
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Conv2D:
        generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list)
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if is_dma_op(npu_op):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    blocks = find_suitable_block_configs(arch, shared_buffer)
    return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    return generate_command_stream(npu_op_list, arch, verbose=False)
Louis Verhaard1e170182020-11-26 11:42:04 +0100973 return generate_command_stream(npu_op_list, arch, verbose=False)