blob: 7de3d9ac6be47c824be4f21228a4705110a0ff82 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
Diego Russoe8a10452020-04-21 17:39:10 +010061from .ethos_u55_regs.ethos_u55_regs import acc_format
62from .ethos_u55_regs.ethos_u55_regs import activation
63from .ethos_u55_regs.ethos_u55_regs import cmd0
64from .ethos_u55_regs.ethos_u55_regs import cmd1
65from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020066from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020067from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010068from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010069from .numeric_util import quantise_float32
70from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010072from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010073from .range_set import MemoryAccessSet
Louis Verhaard1e170182020-11-26 11:42:04 +010074from .register_command_stream_util import calc_blockdep
75from .register_command_stream_util import get_dma_memory_accesses
76from .register_command_stream_util import get_op_memory_accesses
77from .register_command_stream_util import get_strides
78from .register_command_stream_util import get_wait_dependency
79from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010080from .register_command_stream_util import to_kernel
81from .register_command_stream_util import UNARY_ELEMWISE_OPS
82from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010083from .shared_buffer_allocation import find_suitable_block_configs
84from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
85from .shared_buffer_allocation import SharedBufferAllocation
Tim Hall79d07d22020-04-27 18:20:16 +010086
87
class RegisterMachine:
    """Tracks the last value written to each register, per bank, so that
    redundant register writes can be elided from the command stream."""

    def __init__(self):
        # Currently a single bank; kept as a list so more banks can be added.
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records a register write; returns True if the value differs from
        the last value written to this register in the active bank."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
102
103
class CmdMode(IntEnum):
    """Bit fields of the 16-bit command code word."""

    NoPayload = 0x0000  # cmd0: single command word, no payload
    Payload32 = 0x4000  # cmd1: command word followed by a 32-bit payload word
    Mask = 0xC000  # mask selecting the payload-mode bits
    CmdOpMask = 0x03FF  # mask selecting the command opcode bits
109
110
class CommandStreamEmitter:
    """Accumulates the command stream as a list of tuples of 32-bit words.

    Redundant register writes are filtered out via two RegisterMachine
    instances (one tracking DMA commands, one tracking all others).
    """

    WORD_SIZE = 4  # bytes per command stream word

    def __init__(self):
        self.cmd_stream = []  # list of tuples; each tuple holds one command's words
        self.reg_machine = [RegisterMachine(), RegisterMachine()]  # [non-DMA, DMA]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # current end-of-stream offset in bytes

    def get_reg_machine(self, cmd):
        """Returns the register machine tracking this command; DMA commands
        are tracked separately from all other commands."""
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size of the emitted command stream in bytes."""
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        """Flattens the command stream into a single list of 32-bit words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a human-readable dump of the emitted command stream."""
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a cmd0 command word; the parameter is packed into the upper
        16 bits. Skipped if the register already holds the same value."""
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a cmd1 command word followed by a 32-bit payload word.
        Skipped if the register already holds the same value."""
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; never elided, since waits are control flow
        rather than register state."""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits a kernel/DMA start command and switches register bank, so
        register writes for the next operation are tracked independently."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
199
200
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100201# -------------------------------------------------------------------
202# REGISTER GENERATION
203# -------------------------------------------------------------------
204
205
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit fields of the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # swap IFM and IFM2 operand order
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
213
214
# Maps NpuPoolingOp to the pooling_mode hardware register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the elementwise_mode hardware register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the activation hardware register value
# (TABLE_LOOKUP is handled separately in generate_activation)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps the public NpuResamplingMode to the hardware resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
262
263
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value using the given quantization parameters.

    A missing quantization (or missing scale) falls back to scale 1 and
    zero point 0, i.e. plain rounding.
    """
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
269
270
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the four IFM_PAD registers from the given padding."""
    pad_regs = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for reg, value in pad_regs:
        emit.cmd0_with_param(reg, value)
277
278
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No fused activation is encoded as NONE_OR_RELU with no min/max clipping
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # Missing min/max default to the full range of the OFM data type
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    # Clamp to the signed 16-bit range supported by the ACTIVATION_MIN/MAX registers,
    # and to the OFM data type's own range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        # LUT activations are encoded as register values 16..23
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
305
306
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates the four xFM_BASE (tile base address) registers."""
    if layout == NpuLayout.NHCWB16:
        # Brick format requires all tile base addresses to be 16-byte aligned
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for ptr_cmd, addr in zip(ptr_cmds, addresses):
        emit.cmd1_with_offset(ptr_cmd, addr)
316
317
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers (value minus one)."""
    for tile_cmd, size in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, size - 1)
323
324
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the feature map's byte strides."""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)
333
334
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates the IFM/IFM2_PRECISION register.

    Bit 0: signedness, bits 3:2: activation precision (8/16/32 bit),
    bit 6: brick (NHCWB16) format, bits 9:8: operand-to-scale selector.
    """
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 2
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
347
348
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates the OFM_PRECISION register.

    Bit 0: signedness, bits 2:1: activation precision, bit 6: brick
    (NHCWB16) format, bit 8: global scale, bits 15:14: rounding mode.
    """
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 1
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
363
364
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations."""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant scalar rather than a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        # Broadcast every IFM2 dimension of size 1 whose IFM counterpart differs
        shape = npu_op.ifm.shape
        shape2 = npu_op.ifm2.shape
        if shape.height != shape2.height:
            assert shape2.height == 1
            broadcast |= IFM2Broadcast.BroadcastHdim
        if shape.width != shape2.width:
            assert shape2.width == 1
            broadcast |= IFM2Broadcast.BroadcastWdim
        if shape.depth != shape2.depth:
            assert shape2.depth == 1
            broadcast |= IFM2Broadcast.BroadcastCdim
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
392
393
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, tile addresses/sizes, depth,
    strides and zero point."""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
409
410
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers; region/addresses/tiles are skipped
    when IFM2 is a scalar constant rather than a feature map."""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
426
427
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, tile addresses/sizes, full
    shape, strides and zero point."""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape; registers hold the value minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
445
446
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers.

    Kernel size registers hold the dilated size minus one; the stride
    register packs stride low bits, stride extension bits, dilation and
    the block traversal order into one word.
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Kernel stride low bits (x in bit 0, y in bit 1)
    stride = (stride_x_m1 & 1) | ((stride_y_m1 & 1) << 1)
    # Kernel stride extension bits (x in bits 6+, y in bits 9+)
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    # Dilation minus one (x in bit 3, y in bit 4)
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
464
465
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT region/base/length registers for each core."""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_offset(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core present but has no weights: point at core 0's data with zero length
            emit.cmd1_with_offset(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
484
485
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE (bias/scale) region/base/length registers for each core."""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_offset(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core present but has no biases: point at core 0's data with zero length
            emit.cmd1_with_offset(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
501
502
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers (value minus one)."""
    blk = npu_op.block_config
    assert blk is not None, "block_config has not been set"
    # Verify the chosen block configuration fits the shared buffer allocation
    alloc = shared_buffer.try_block(Block(blk.width, blk.height, blk.depth))
    assert alloc is not None, f"Block config {blk} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, blk.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, blk.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, blk.depth - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100517
518
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations."""
    # Elementwise ops claim the total size of available SHRAM as input buffer
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
540
541
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations."""
    # Input buffer ends where the IFM's allocated banks end
    ifm_ib_end = (
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM]
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
550
551
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation.

    Maps the operation's concrete type to its NpuBlockType and delegates to
    shared_buffer_allocation_for_npu_op.

    Raises ValueError for unsupported operation types. A raise is used instead
    of the previous "assert 0": asserts are stripped under "python -O", which
    would have left block_type unbound and failed later with a confusing
    UnboundLocalError.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        # REDUCE_SUM is a pooling sub-op but uses its own block type
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        raise ValueError(f"Unsupported operation: {type(npu_op).__name__}")
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)
566
567
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT commands; a negative count means no
    wait is required on that channel."""
    waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in waits:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
575
576
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # IFM: addresses/shape/strides, precision and upscaling mode
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # OFM: addresses/shape/strides and precision
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise ops have no kernel; all other block ops do
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block config and SHRAM layout depend on the shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100606
607
608# -------------------------------------------------------------------
609# SCALING
610# -------------------------------------------------------------------
611
612
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Fused sigmoid/tanh: output range is fixed, scale relative to 0x3000
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double rescale until it would overflow half the int16 range (max 30 shifts)
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # Number of bits needed to represent the rescale factor (+1 headroom)
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # Missing quantization: identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
665
666
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        # Fused sigmoid/tanh overrides the output scale to the fixed 1/0x3000 range
        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Any missing scale: fall back to identity scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Any missing scale: identity scaling, unless an explicit rescale is given
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining unary ops (e.g. CLZ/SHR/SHL/MIN/MAX) use identity scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
725
726
# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200730
731
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints shape, quantization, stride and tile info for a feature map; no-op when fm is None."""
    if fm is None:
        return
    if fm.quantization is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    size_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout},"
        f" {fm.data_type}, size={size_bytes}, {quant_str}"
    )
    strides = get_strides(fm)
    tiles = fm.tiles
    tile_bases = [hex(addr) for addr in tiles.addresses]
    print(
        f" Stride y/x/c: {strides.height}/{strides.width}/{strides.depth},"
        f" tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, base={tile_bases}"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100747
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100748
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human-readable summary of a single NPU operation.

    :param npu_op: the operation to print
    :param index: position of the operation in the command list, used as line prefix
    :param cmd: optional originating high-level command, appended to the header line
    """
    pass_info = f", {cmd}" if cmd else ""
    # Operations that are neither DMA nor block operations get a one-line summary only
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 convolution with unit stride/dilation is reported as fully connected.
        # Fix: guard k against None here, like the "Kernel:" print below already does,
        # so a Conv2D op without a kernel cannot crash the debug printer.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # NONE_OR_RELU with no clamp bounds is the default; only print anything else
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100796
Tim Hall79d07d22020-04-27 18:20:16 +0100797
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints a summary of every NPU operation in the list, prefixed by its index."""
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for idx, op in enumerate(npu_op_list):
        print_operation(op, idx, npu_op_to_cmd.get(op))
Tim Hall79d07d22020-04-27 18:20:16 +0100802
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100803
# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------
807
808
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that kicks off execution of the given operation."""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA channel and mode are packed into the single parameter field
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
823
824
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the register settings for a 2D convolution; block traversal is taken from the op."""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100828
829
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the register settings for a depthwise convolution (always depth-first traversal)."""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100835
836
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Emits the register settings for a pooling operation."""
    # Global OFM scaling is only used for unpadded average/reduce-sum pooling
    is_avg_or_sum = npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = is_avg_or_sum and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling-specific: global OFM scale register
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100846
847
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the register settings for an elementwise operation, including IFM2 for binary ops."""
    # These sub-ops use the global OFM scale register
    global_scale_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in global_scale_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation: also set up the IFM2 registers
    assert npu_op.ifm2 is not None
    scalar_input = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, scalar_input)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if scalar_input:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        # The quantized scalar must fit in IFM2's data type
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873
874
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits the source/destination/length registers for a DMA transfer."""
    src = dma_op.src
    dest = dma_op.dest
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dest.address)
    # Transfer length comes from the source address range
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, src.length)
883
884
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Emits all register commands for the given operation, except the final
    NPU_OP_* command (emitted separately by generate_operation_code).
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100902
903
def generate_command_stream(
    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None, npu_op_to_cmd=None
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to encode, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated command stream
    :param add_to_debug_db: optional callback(npu_op, offset) invoked with the command
        stream offset after each operation's NPU_OP command is emitted
    :param npu_op_to_cmd: optional mapping from NPU operation to the originating
        high-level command; used for verbose printing only
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation; needed for wait dependencies below
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"
    if arch.is_ethos_u65_system:
        # Ethos-U65: configure how many cores run in parallel (register takes ncores-1)
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None  # previous block operation, input to BLOCKDEP calculation
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        # NOTE(review): the "not NpuDmaOperation" check looks redundant if NpuDmaOperation
        # is not a subclass of NpuBlockOperation — confirm against the api module
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP, capped at the architecture's maximum
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
951
952
# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100956
957
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    Returns an empty list for operations that are not block operations (e.g. DMA).
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    configs = []
    for block in find_suitable_block_configs(arch, shared_buffer):
        # depth deliberately comes from index 3; index 2 is unused here
        configs.append(NpuShape3D(height=block[0], width=block[1], depth=block[3]))
    return configs
Louis Verhaard933f55e2020-11-25 14:10:30 +0100968
969
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    return generate_command_stream(npu_op_list, arch, verbose=False)