# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional
from typing import Tuple

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
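        # Worked example of the encoding above (illustrative values, not a real
        # opcode mapping): assuming cmd.value == 0x010E and param == 5, the
        # emitted word is 0x010E | (5 << 16) == 0x0005010E. print_cmds() undoes
        # this: code = word & 0xFFFF, param = word >> 16, and CmdMode(code &
        # CmdMode.Mask) distinguishes no-payload (cmd0) commands from those
        # carrying a 32-bit payload word (cmd1).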

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
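    # Example (illustrative): quantise(1.0, NpuQuantization(scale_f32=0.5, zero_point=10))
    # yields round(1.0 / 0.5) + 10 = 12, given quantise_float32's
    # value/scale-plus-zero-point convention.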


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
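    # Resulting bit layout, as assembled above: bit 0 = signed, bits 2..3 =
    # activation precision (from precision_map), bit 6 = NHCWB16 layout,
    # bits 8.. = op_to_scale. E.g. (illustrative) a signed INT8 NHWC feature
    # map with op_to_scale=0 encodes as prec = 1.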


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
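    # Worked example (illustrative): stride_x=2, stride_y=2, dilation 1x1 and
    # DEPTH_FIRST traversal give (2-1)&1 = 1 in bit 0, (2-1)&1 = 1 in bit 1 and
    # zero extension/dilation bits, i.e. stride = 0b11. Note that `-` binds
    # tighter than `&` and `>>` in Python, so `kernel.stride_y - 1 & 1` parses
    # as `(kernel.stride_y - 1) & 1`.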


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    op_type = npu_op.op_type
    block_type = NpuBlockType.Default
    if op_type == NpuOperationType.Conv2D:
        block_type = NpuBlockType.ConvolutionMxN
    elif op_type == NpuOperationType.ConvDepthWise:
        block_type = NpuBlockType.ConvolutionDepthWise
    elif op_type == NpuOperationType.Pooling:
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif op_type == NpuOperationType.ElementWise:
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if npu_op.op_type == NpuOperationType.ElementWise:
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output the scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool is fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
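    # Example (illustrative): in the fused-quantize branch, ifm scale 0.05 and
    # ofm scale 0.1 ask scaling.quantise_scale for a fixed-point approximation
    # of 0.5, i.e. a (scale, shift) pair with scale / (1 << shift) close to 0.5;
    # the exact pair depends on quantise_scale's chosen precision.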


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """Checks if there is any address overlap between list1 and list2"""
    for range1 in list1:
        if range1 is None:
            continue
        for range2 in list2:
            if range2 is not None and ranges_overlap(range1, range2):
                return True
    return False


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
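    # Example (illustrative): an 8-bit NHWC feature map with w=16, c=64 gets
    # stride_c = 1 byte, stride_x = 64 bytes (one pixel of channels) and
    # stride_y = 16 * 64 = 1024 bytes (one row).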


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )
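    # Addressing note (illustrative): in NHCWB16, channel c falls in 16-channel
    # brick c // BRICK, at byte offset (c % BRICK) * elem_size within the brick,
    # so a pixel's neighbouring channels are contiguous in memory while bricks
    # lie strides.depth bytes apart.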


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
    The begin and end coordinates must be within the same tile.
    """
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
    the begin and end coordinates must be within the same tile.
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
    """
    return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]


def get_address_ranges_for_area(
    fm: NpuFeatureMap, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Returns a list of address ranges that covers the area (y0, x0, c0) - (y1, x1, c1) (inclusive).
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

         .....|....           .....|....
      t0 ..XXX|XX.. t1     t0 ..AAA|CC.. t1
         ..XXX|XX..           ..BBB|DD..
         -----+----    -->    -----+----
      t2 ..XXX|XX.. t3     t2 ..EEE|FF.. t3
         .....|....           .....|....
    """
    strides = get_strides(fm)
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    h, w, c = fm.shape
    y2, x2, c2 = min(y1, h - 1), min(x1, w - 1), min(c1, c - 1)
    ranges = []
    if x0 < width_0 and y0 < height_0:
        # Horizontal ranges for tile 0
        ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y2, height_0 - 1), min(x2, width_0 - 1), c2))
    if x2 >= width_0 and y0 < height_1:
        # Horizontal ranges for tile 1
        ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y2, height_1 - 1), x2, c2))
    if x0 < width_0 and y2 >= height_0:
        # Horizontal ranges for tile 2
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y2, min(x2, width_0 - 1), c2))
    if x2 >= width_0 and y2 >= height_1:
        # Horizontal ranges for tile 3
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y2, x2, c2))
    return ranges


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
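    # Example (illustrative): scanning backwards, if this op reads a buffer
    # written by the second-most-recent DMA (one unrelated DMA was issued after
    # it), dma_outstanding becomes 1; the resulting DMA_WAIT with
    # outstanding_count=1 fences the dependency while letting the newer
    # transfer stay in flight.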


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits the KERNEL_WAIT/DMA_WAIT commands for the given dependencies"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def shape3d_size(shape: NpuShape3D) -> int:
    return shape.width * shape.height * shape.depth


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_ranges = get_address_ranges(npu_op.ifm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_ranges = get_address_ranges(npu_op.ifm2)
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
    else:
        ifm2_overlaps = False
    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not ifm_overlaps and not ifm2_overlaps:
        # No overlap between prev OFM and IFM/IFM2
        return ArchitectureFeatures.MAX_BLOCKDEP
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    def intersects(ifm_start_coord: Tuple, ifm_end_coord: Tuple, ofm_start_coord: Tuple, ofm_end_coord: Tuple) -> bool:
        """Checks if the given IFM area overlaps with the given OFM area"""
        if overlapping_fm.shape == prev_op.ofm.shape and overlapping_fm.tiles == prev_op.ofm.tiles:
            # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
            # if the xyz coordinates overlap, which is quick and easy
            return ArchitectureFeatures.intersects(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
        # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
        # In this case address comparison is needed between the two areas
        x0, y0, c0 = ifm_start_coord
        x1, y1, c1 = ifm_end_coord
        ifm_ranges = get_address_ranges_for_area(overlapping_fm, y0, x0, c0, y1, x1, c1)
        x0, y0, c0 = ofm_start_coord
        x1, y1, c1 = ofm_end_coord
        prev_ofm_ranges = get_address_ranges_for_area(prev_op.ofm, y0, x0, c0, y1, x1, c1)
        return range_lists_overlap(ifm_ranges, prev_ofm_ranges)

    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    return arch.calc_block_dep(
        prev_ofm_rect,
        prev_ofm_block,
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
        intersects=intersects,
    )
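    # Interpretation (as used here): returning 0 fully serializes npu_op after
    # prev_op; MAX_BLOCKDEP, returned when the address ranges are disjoint,
    # permits maximum pipeline overlap, and arch.calc_block_dep picks an
    # intermediate value based on block-by-block intersection of the two ops.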
Tim Hall79d07d22020-04-27 18:20:16 +01001082
1083
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001084# -------------------------------------------------------------------
1085# PRINT
1086# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +02001087
1088
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001089def print_feature_map(fm: NpuFeatureMap, name: str):
1090 if fm is not None:
1091 q = (
1092 "no quantization"
1093 if fm.quantization is None
1094 else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
1095 )
1096 h, w, c = fm.shape
1097 sz = h * w * c * fm.data_type.size_in_bytes()
1098 print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
1099 strides = get_strides(fm)
1100 stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
1101 t = fm.tiles
1102 addresses = [hex(addr) for addr in t.addresses]
1103 print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +01001104
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001105
1106def print_operation(npu_op: NpuOperation, index: int = 0):
1107 pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
1108 if is_dma_op(npu_op):
1109 print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
1110 return
1111 k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
1112 if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
1113 print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
Patrik Gustavssoneca2e952020-05-27 09:15:11 +02001114 else:
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001115 if (
1116 npu_op.op_type == NpuOperationType.Conv2D
1117 and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
1118 ):
1119 fc = "FullyConnected "
1120 else:
1121 fc = ""
1122 print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
1123 print_feature_map(npu_op.ifm, "IFM")
1124 if npu_op.ifm2_scalar is not None:
1125 quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
1126 print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
1127 else:
1128 print_feature_map(npu_op.ifm2, "IFM2")
1129 print_feature_map(npu_op.ofm, "OFM")
1130 if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
1131 print(f" Kernel: {k}")
1132 if npu_op.padding is not None:
1133 print(f" {npu_op.padding}")
1134 for weights in npu_op.weights:
1135 print(f" Weights: {weights}")
1136 for bias in npu_op.biases:
1137 print(f" Scales: {bias}")
1138 if npu_op.activation is not None:
1139 act = npu_op.activation
1140 if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
1141 lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
1142 print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
1143 if npu_op.op_type == NpuOperationType.Conv2D:
1144 print(f" {npu_op.block_traversal}")
1145 bh, bw, bc = npu_op.block_config
1146 rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
1147 print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +01001148
Tim Hall79d07d22020-04-27 18:20:16 +01001149
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001150def print_operations(npu_op_list: List[NpuOperation]):
1151 for index, npu_op in enumerate(npu_op_list):
1152 print_operation(npu_op, index)
Tim Hall79d07d22020-04-27 18:20:16 +01001153
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001154
1155# -------------------------------------------------------------------
1156# OPERATIONS
1157# -------------------------------------------------------------------
1158
1159
1160def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
1161 """Generates NPU_OP_* command"""
1162 op_type = npu_op.op_type
1163 if op_type == NpuOperationType.Dma:
1164 emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
1165 elif op_type == NpuOperationType.Conv2D:
1166 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
1167 elif op_type == NpuOperationType.ConvDepthWise:
1168 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
1169 elif op_type == NpuOperationType.Pooling:
1170 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
1171 elif op_type == NpuOperationType.ElementWise:
1172 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
1173 else:
1174 assert 0, "Unsupported operation"
1175
1176
Louis Verhaard933f55e2020-11-25 14:10:30 +01001177def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001178 """Generates register commands for Conv2D operations"""
1179 generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001180
1181
1182def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
1183 """Generates register commands for depthwise convolution operations"""
1184 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001185
1186
1187def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
1188 """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
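            # Illustrative example (values assumed): with ifm2 quantisation
            # scale_f32=0.5 and zero_point=0, a scalar of 1.0 quantises to 2,
            # which must lie within the ifm2 data type's representable range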
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Conv2D:
        generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
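    # On multi-core Ethos-U65 systems, PARALLEL_MODE tells the NPU how many
    # cores to spread the work across (ncores - 1, i.e. presumably 0 for
    # single-core and 1 for dual-core operation)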
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
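            # BLOCKDEP expresses how far this operation may run ahead of the
            # previous one at block granularity; 0 serialises the two operations
            # completely, while larger values allow more overlap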
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates the command stream for the subgraph and adds it to sg.register_command_stream"""
    # Convert the high level command stream to a list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero; can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """Internal implementation of the public facing API for finding block configs"""
    if is_dma_op(npu_op):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    blocks = find_suitable_block_configs(arch, shared_buffer)
    return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
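# Illustrative usage (operation list assumed to be fully populated):
#
#   cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#   # cmds is a list of 32-bit words forming the Ethos-U command stream;
#   # external callers normally reach this via the public facing API in api.py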