# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream, generates
# all the register settings, calculates dependencies between commands and inserts wait operations, and produces a
# bit stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .operation import Op
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose

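
# RegisterMachine caches the last (command, value) pair written for each
# register so that redundant NPU_SET_* writes can be dropped from the stream.
# The emitter below keeps two of these: DMA register writes are de-duplicated
# independently of the other NPU registers (see CommandStreamEmitter.get_reg_machine).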
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

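    # Command encoding (as used by the emitters below): a cmd0 command is a
    # single 32-bit word with the opcode in bits [15:0] and a 16-bit parameter
    # in bits [31:16]; a cmd1 command additionally sets CmdMode.Payload32 in
    # the opcode word and is followed by one 32-bit payload word.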
    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF  # payload is a single 32-bit word
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd, channel, outstanding_count):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


Watermark = namedtuple("Watermark", ["npu", "dma"])
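# A Watermark holds one command-stream index per engine (NPU kernels, DMA):
# everything before that index is known to have its dependencies satisfied,
# so the dependency search below never has to look further back than that.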


def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
    cmd = cmd_stream[cmd_index]
    cmd_access = memory_accesses[cmd]
    index = cmd_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_cmd = cmd_stream[index]
        prev_access = memory_accesses[prev_cmd]

        # Check DMA consuming NPU output
        if prev_cmd.cmdtype == CommandType.NpuStripe:
            if index >= npu_index:
                if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
                    npu_outstanding = npu_ops
                npu_ops = npu_ops + 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        # Check NPU consuming DMA output
        elif prev_cmd.cmdtype == CommandType.DMA:
            if index >= dma_index:
                if cmd.cmdtype == CommandType.NpuStripe:
                    if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
                        dma_outstanding = dma_ops
                dma_ops = dma_ops + 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)

        index = index - 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = cmd_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)
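    # Example: a DMA command whose accesses conflict with the NPU stripe
    # immediately before it yields outstanding.npu == 0 ("wait until no NPU
    # jobs are outstanding"); if the conflicting stripe is two commands back,
    # outstanding.npu == 1, i.e. one NPU job may still be in flight.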

    return watermark, outstanding
Tim Hall79d07d22020-04-27 18:20:16 +0100247
248
Tim Hall79d07d22020-04-27 18:20:16 +0100249def has_prev_op_dependency(prev_cmd, cmd):
250 if prev_cmd is None:
251 return False
252 if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200253 if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor):
Tim Hall79d07d22020-04-27 18:20:16 +0100254 return True
Tim Hall90337952020-05-07 16:42:35 +0100255 elif cmd.ifm2_tensor is not None:
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200256 return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor)
Tim Hall79d07d22020-04-27 18:20:16 +0100257 return False
258
259
260def get_op_ofm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200261 start = full_shape(4, cmd.ofm_box.start_coord, 0)
262 end = full_shape(4, cmd.ofm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100263 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
264
265
266def get_op_ifm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200267 start = full_shape(4, cmd.ifm_box.start_coord, 0)
268 end = full_shape(4, cmd.ifm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100269 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
270
271
272def get_op_ifmofm_block_depth(arch, cmd):
273 # Note: NOT equivalent to the normal ifm block depth calculation since
274 # it takes into account 'depthless' block operations by returning full
275 # depth
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200276 if cmd.ps.npu_block_type in (
277 NpuBlockType.ConvolutionDepthWise,
278 NpuBlockType.Pooling,
279 NpuBlockType.ElementWise,
280 NpuBlockType.ReduceSum,
281 ):
Tim Hall79d07d22020-04-27 18:20:16 +0100282 return cmd.ofm_box.get_size_shape()[-1]
283
284 return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
285
286
287def get_op_padding_lt(cmd):
288 if cmd.ps.npu_block_type not in (
289 NpuBlockType.ConvolutionDepthWise,
290 NpuBlockType.Pooling,
291 NpuBlockType.ConvolutionMxN,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200292 NpuBlockType.ReduceSum,
Tim Hall79d07d22020-04-27 18:20:16 +0100293 ):
294 return (0, 0)
295
296 explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
297
298 # Check if this is for horizontal ifm streaming
299 if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
300 explicit_padding[0] = cmd.pad_top
301 explicit_padding[2] = cmd.pad_bottom
302
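    # The return value is (left, top); e.g. explicit_padding == [1, 2, 0, 0]
    # (top, left, bottom, right) yields (2, 1).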
    return (explicit_padding[1], explicit_padding[0])


def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    elif ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False

    return True


def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        Op.Mul: elementwise_mode.MUL.value,
        Op.Add: elementwise_mode.ADD.value,
        Op.Sub: elementwise_mode.SUB.value,
        Op.Minimum: elementwise_mode.MIN.value,
        Op.Maximum: elementwise_mode.MAX.value,
        Op.LeakyRelu: elementwise_mode.LRELU.value,
        Op.Abs: elementwise_mode.ABS.value,
        Op.CLZ: elementwise_mode.CLZ.value,
        Op.SHR: elementwise_mode.SHR.value,
        Op.SHL: elementwise_mode.SHL.value,
    }

    cmd_stream = []
    memory_accesses = {}
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)
            memory_accesses[cmd] = cmd.get_memory_accesses()

    def emit_cmd_waits(cmd_waits):
        if cmd_waits.npu >= 0:
            emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

        if cmd_waits.dma >= 0:
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)

    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing

    for cmd_index, cmd in enumerate(cmd_stream):
        dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark)

        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                if cmd.out_tensor.purpose == TensorPurpose.FSBias:
                    sz = cmd.in_tensor.storage_size()
                else:
                    stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                    sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            if cmd.out_tensor.purpose == TensorPurpose.LUT:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_cmd_waits(cmd_waits)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = (
                rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL
            )
            if primary_op.type == Op.ResizeBilinear:
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.memory_function
            faf = primary_op.activation
            fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
            # Force output scale, used in operations with fused LUT
            # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization
            # except when primary_op is AddAct + 0 (no-op) + LUT
            forced_ofm_quantization = primary_op.forced_output_quantization
            ofm_quant = cmd.ofm_tensor.quantization
            if forced_ofm_quantization is not None:
                ofm_quant = forced_ofm_quantization

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = ps.primary_op.kernel if ps.primary_op else None

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None
                    output_scale = ofm_quant.scale_f32 if ofm_quant else None
                    use_global_scale = True

                    if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh):
                        output_scale = 1 / 0x3000
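                        # 0x3000 == 3 * 4096; fused Sigmoid/Tanh uses a fixed
                        # output scale of 1/(3*4096) (see the pooling path below)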

                    if primary_op.type == Op.Mul:
                        if None in (input_scale, input2_scale, output_scale):
                            ofm_scale = 1
                            shift = 0
                        else:
                            ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        # Force output scale same as the input scale for
                        # resizebilinear 1x1 that is converted to add
                        if "resizebilinear" in primary_op.attrs:
                            output_scale = input2_scale

                        if None in (input_scale, input2_scale, output_scale):
                            opa_scale = opb_scale = ofm_scale = 1
                            opa_shift = shift = 0
                            ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0])
                        elif input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)):
                    output_scale = ofm_quant.scale_f32
                    use_global_scale = True

                    if primary_op.type == Op.LeakyRelu:
                        output_scale = primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                else:
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

                # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
                uses_lut = primary_op.activation_lut is not None
                shram_required = arch.available_shram_banks(uses_lut)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START,
                        (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
                    )

                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == Op.ResizeBilinear:
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (
                    NpuBlockType.ConvolutionMxN,
                    NpuBlockType.ConvolutionDepthWise,
                    NpuBlockType.Pooling,
                    NpuBlockType.ReduceSum,
                )
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                # because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
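                # KERNEL_STRIDE so far: bit 0 = (stride_x - 1) & 1,
                # bit 1 = (stride_y - 1) & 1, bits 6.. = remaining stride_x bits,
                # bits 9.. = remaining stride_y bits. Bit 2 (part-kernel-first)
                # and bits 3/4 (dilation - 1) are OR-ed in on the convolution
                # path below before the register is written.
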
641
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200642 if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
Tim Hall79d07d22020-04-27 18:20:16 +0100643 k_height, k_width = primary_op.attrs["ksize"][1:3]
644 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
645 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
646
647 valid_padding = sum(explicit_padding) == 0
648
Louis Verhaardaee5d752020-09-30 09:01:52 +0200649 if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
Tim Hall79d07d22020-04-27 18:20:16 +0100650 # For valid padding vela has to output scaling values
Louis Verhaardaee5d752020-09-30 09:01:52 +0200651 if faf == Op.Sigmoid or faf == Op.Tanh:
Tim Hall79d07d22020-04-27 18:20:16 +0100652 rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200653 if cmd.ifm_tensor.dtype == DataType.int16:
Charles Xuf8992312020-08-18 08:41:54 +0200654 # Calculate scale and shift for the output scale of 1/(3*4096)
655 shift = 0
656 max_rescale = np.iinfo(np.int16).max / 2
657 while rescale <= max_rescale and shift <= 30:
658 shift += 1
659 rescale *= 2
660 scale = int(rescale)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200661 else:
Charles Xuf8992312020-08-18 08:41:54 +0200662 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
663 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200664 scale = int(round_away_zero(scale * rescale))
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200665 elif fused_quantize:
666 # Quantize op requires different scaling
667 ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
Louis Verhaardd7911c42020-08-25 13:36:41 +0200668 ofm_scale_f64 = np.double(ofm_quant.scale_f32)
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200669 scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
Louis Verhaardaee5d752020-09-30 09:01:52 +0200670 elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
Charles Xu87c13502020-08-06 12:17:26 +0200671 rescale = primary_op.attrs["rescale"]
672 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
673 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
674 scale = int(round_away_zero(scale * rescale))
Tim Hall79d07d22020-04-27 18:20:16 +0100675 else:
676 # In case avg pool fused with concat or other memory operation, rescaling might be needed.
677 # k_height == k_width == 1 is allways true in this case
678 # Normally the scale is maximised, to get maximum precision, which means that
679 # if rescale != 1, scale need to consider the number of bits needed for rescaling
                            if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
                                rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
                                rescale_bits = 0
                                if k_height == k_width == 1:
                                    if fmf == Op.ConcatSliceWrite:
                                        rounding_mode = rounding.NATURAL
                                    if rescale > 1:
                                        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                    elif rescale < 1:
                                        rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                                scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                scale = int(round_away_zero(scale * rescale))
                            else:
                                scale = 1
                                shift = 0

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution, so we need
                # to set up the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                # Extract weight substream offsets and calculate their lengths
                assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                # Set weights sources for active and present cores
                for core, param in enumerate(
                    [
                        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                    ]
                ):
                    if core < substreams:
                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                        emit.cmd1_with_offset(
                            param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
                        )
                    elif core < arch.ncores:
                        emit.cmd1_with_offset(param[0], weight_addr)
                        emit.cmd1_with_offset(param[1], 0)

                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                    # Extract scale substream offsets and calculate their lengths
                    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                    # Set scale sources for active and present cores
                    for core, param in enumerate(
                        [
                            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                        ]
                    ):
                        if core < substreams:
                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                            emit.cmd1_with_offset(
                                param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
                            )
                        elif core < arch.ncores:
                            emit.cmd1_with_offset(param[0], scale_addr)
                            emit.cmd1_with_offset(param[1], 0)

                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

            ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min
            ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max
            ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min
            ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == Op.Relu:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == Op.Relu6:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.ReluN1To1:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.Tanh:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.Sigmoid:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == Op.LUT:
                lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
                assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
                if cmd.ofm_tensor.dtype == DataType.int32:
                    lut_index |= 3 << 12  # Force I8 range
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            else:
                raise Exception("Unsupported fused_activation_function = " + faf.name)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (
                    (faf is not None and forced_ofm_quantization is None)
                    or (fmf == Op.ConcatSliceWrite)
                    or fused_quantize
                )
                if (
                    (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
                    or (
                        tens.dtype == DataType.int32
                        and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
                    )
                    or tens.quantization is None
                ):
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None:
                        zero_point = forced_ofm_quantization.zero_point
                    elif (
                        "resizebilinear" in primary_op.attrs
                        and primary_op.type == Op.Add
                        and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
                    ):
                        # Force output zero point same as the input zero point
                        # for resizebilinear 1x1 that is converted to add
                        zero_point = cmd.ifm2_tensor.quantization.zero_point
                    else:
                        zero_point = tens.quantization.zero_point
                    emit.cmd0_with_param(zero_point_op, int(zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        assert out_shape[0] == 1
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            elif ofm_dtype.size_in_bits() == 32:
                prec = 4
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14
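            # OFM_PRECISION layout as assembled above: bits [2:0] = data type
            # (width code plus signed bit), bit 6 = NHCWB16 format, bit 8 =
            # global scale, bits [15:14] = rounding mode.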

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert (
                ifm_dtype.size_in_bits() in {8, 16}
                or ifm_dtype.size_in_bits() == 32
                and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
            ), "Unsupported ifm bit depth"

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16
            elif ifm_dtype == DataType.int32:
                prec = ifm_precision.S32

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            emit_cmd_waits(cmd_waits)
            DebugDatabase.add_command(stream_id, emit.offset, primary_op)

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ReduceSum:
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))
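

# Typical usage (a sketch; `sg` is a scheduled and allocated subgraph produced
# by the preceding compilation passes, `arch` an ArchitectureFeatures instance):
#
#     generate_register_command_stream(nng, sg, arch, verbose=True)
#     words = sg.register_command_stream  # flat list of 32-bit command words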