Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
18# all the register settings, calculates dependencies between commands, inserts wait operations, and produces a bit
19# stream suitable for interpretation by the Ethos-U55 processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Tim Hall289a41d2020-08-04 21:40:14 +010021from collections import namedtuple
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Diego Russoea6111a2020-04-14 18:41:58 +010024
25import numpy as np
26
27from . import scaling
Diego Russoe8a10452020-04-21 17:39:10 +010028from .architecture_features import ArchitectureFeatures
29from .architecture_features import Block
30from .architecture_features import Kernel
31from .architecture_features import Rect
32from .architecture_features import SharedBufferArea
33from .architecture_features import SHRAMElements
34from .data_type import BaseType
35from .data_type import DataType
36from .ethos_u55_regs.ethos_u55_regs import acc_format
37from .ethos_u55_regs.ethos_u55_regs import activation
38from .ethos_u55_regs.ethos_u55_regs import cmd0
39from .ethos_u55_regs.ethos_u55_regs import cmd1
40from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
41from .ethos_u55_regs.ethos_u55_regs import ifm_precision
Fredrik Svedberga0c36242020-06-03 15:43:31 +020042from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020043from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010044from .ethos_u55_regs.ethos_u55_regs import rounding
Tim Hall79d07d22020-04-27 18:20:16 +010045from .high_level_command_stream import CommandType
Diego Russoe8a10452020-04-21 17:39:10 +010046from .numeric_util import clamp_sigmoid
47from .numeric_util import clamp_tanh
Louis Verhaardb2fb2122020-06-04 15:51:24 +020048from .numeric_util import full_shape
Diego Russoe8a10452020-04-21 17:39:10 +010049from .numeric_util import quantise_float32
50from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010051from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010052from .operation import NpuBlockType
Tim Hall79d07d22020-04-27 18:20:16 +010053from .shared_buffer_allocation import SharedBufferAllocation
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020054from .tensor import MemType
Diego Russoe8a10452020-04-21 17:39:10 +010055from .tensor import TensorBlockTraversal
56from .tensor import TensorFormat
Fredrik Svedberga0c36242020-06-03 15:43:31 +020057from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010058
59
60class RegisterMachine:
61 def __init__(self):
62 self.n_banks = 1
63 self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
64 self.bank_idx = 0
65
66 def set_register(self, reg, value):
67 is_changed = self.registers[self.bank_idx][reg] != value
68 self.registers[self.bank_idx][reg] = value
69 # is_changed = True # force command
70 return is_changed
71
72 def switch_bank(self):
73 self.bank_idx = (self.bank_idx + 1) % self.n_banks
74
75
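# Illustrative sketch (not part of the original file): RegisterMachine caches the last value written
# to each register so that repeated, identical writes can be dropped from the command stream. The demo
# below uses a plain string as the register key purely for brevity; the real callers pass cmd0/cmd1
# enum members. The helper name _demo_register_machine is hypothetical.
def _demo_register_machine():
    rm = RegisterMachine()
    assert rm.set_register("NPU_SET_IFM_PAD_TOP", 1) is True   # first write -> emit
    assert rm.set_register("NPU_SET_IFM_PAD_TOP", 1) is False  # same value -> redundant, skip
    assert rm.set_register("NPU_SET_IFM_PAD_TOP", 2) is True   # value changed -> emit again
    rm.switch_bank()  # with n_banks == 1 this is a no-op, kept for symmetry with the emitter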
76class CmdMode(IntEnum):
77 NoPayload = 0x0000
78 Payload32 = 0x4000
79 Mask = 0xC000
80 CmdOpMask = 0x03FF
81
82
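# Illustrative sketch (not part of the original file): every command stream word packs a 10-bit opcode
# and a payload-mode flag into its lower 16 bits and a 16-bit parameter into its upper 16 bits, exactly
# as cmd0_with_param()/print_cmds() below encode and decode it. The opcode 0x012 used here is a made-up
# value for illustration only; real opcodes come from the cmd0/cmd1 enums.
def _demo_cmd_word_layout():
    opcode, param = 0x012, 7
    word = opcode | (param << 16)  # cmd0-style word, no payload
    code = word & 0x0000FFFF
    assert CmdMode(code & CmdMode.Mask) == CmdMode.NoPayload
    assert (code & CmdMode.CmdOpMask) == opcode
    assert (word >> 16) == param

    word = opcode | CmdMode.Payload32.value | (param << 16)  # cmd1-style word, a 32-bit payload word follows
    code = word & 0x0000FFFF
    assert CmdMode(code & CmdMode.Mask) == CmdMode.Payload32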
83class BasePointerIndex(IntEnum):
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020084 WeightTensor = 0 # base address index for the Weight tensor
85 ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena
86 ScratchFastTensor = 2 # base address index for the Scratch_fast_tensor
Fredrik Svedberga0c36242020-06-03 15:43:31 +020087 Mem2Mem = (1 << 8) | (3 << 0) # base address slot for memory-to-memory transfer
Tim Hall79d07d22020-04-27 18:20:16 +010088
89
90# TODO: Replace with definitions from ethos_u55_regs
91class IFM2Broadcast(IntEnum):
92 BroadcastHdim = 1 << 0
93 BroadcastWdim = 1 << 1
94 BroadcastCdim = 1 << 2
95 ReverseOperandOrder = 1 << 6
96 UseIFM2Scalar = 1 << 7
97
98
99class CommandStreamEmitter:
100 def __init__(self):
101 self.cmd_stream = []
102 self.reg_machine = [RegisterMachine(), RegisterMachine()]
103 self.last_absolute_wait = defaultdict(int)
104
105 def get_reg_machine(self, cmd):
106 if "DMA" in cmd.name:
107 return self.reg_machine[1]
108 else:
109 return self.reg_machine[0]
110
111 def size_in_bytes(self):
112 sz = 0
113 for cmd in self.cmd_stream:
114 sz += len(cmd) * 4
115 return sz
116
117 def to_list(self):
118 return [elem for cmd in self.cmd_stream for elem in cmd]
119
120 def print_cmds(self):
121 print("Code: Command: Param: Payload:")
122 for words_for_one_command in self.cmd_stream:
123 code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits
124 param = words_for_one_command[0] >> 16 # higher 16 bits
125
126 payload_mode = CmdMode(code & CmdMode.Mask)
127
128 # code and command
129 s = " 0x%04x " % code
130 if payload_mode == CmdMode.NoPayload:
131 s += str(cmd0(code & CmdMode.CmdOpMask))
132 else:
133 s += str(cmd1(code & CmdMode.CmdOpMask))
134
135 s = s.ljust(40)
136 s += "%5d" % param
137
138 # payload
139 if payload_mode == CmdMode.Payload32:
140 s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
141 else:
142 s += " -"
143
144 print(s)
145
146 def cmd0_with_param(self, cmd, param):
147 if isinstance(param, Enum):
148 param = int(param.value)
149 else:
150 param = int(param)
151 param = param & 0xFFFF
152 command = cmd.value | (param << 16)
153 if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
154 return
155
156 # This is not a redundant command, actually write it
157 self.cmd_stream.append((command,))
158
159 def cmd1_with_offset(self, cmd, offset, param=0x0):
160 offset = int(offset) & 0xFFFFFFFFF
161 command = cmd.value | CmdMode.Payload32.value | (param << 16)
162
163 if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
164 return
165
166 # This is not a redundant command, actually write it
167 self.cmd_stream.append((command, offset))
168
Tim Hall289a41d2020-08-04 21:40:14 +0100169 def cmd_wait(self, cmd, channel, outstanding_count):
170 param = (16 * channel) + outstanding_count
Tim Hall79d07d22020-04-27 18:20:16 +0100171 command = ((param & 0xFFFF) << 16) | cmd.value
172 self.cmd_stream.append((command,))
173
174 def cmd_do_operation(self, cmd, param=0):
175 param = int(param)
176 command = ((param & 0xFFFF) << 16) | cmd.value
177
178 self.cmd_stream.append((command,))
179 self.get_reg_machine(cmd).switch_bank()
180
181
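# Illustrative sketch (not part of the original file): minimal use of CommandStreamEmitter, assuming
# only register/opcode names that already appear elsewhere in this module. Identical register writes
# are filtered out by the per-bank RegisterMachine, so the second pad write emits nothing; the helper
# name _demo_command_stream_emitter is hypothetical.
def _demo_command_stream_emitter():
    emit = CommandStreamEmitter()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)      # redundant -> dropped
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, 1024)     # 32-bit payload command -> two words
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)  # operation words also switch the register bank
    assert emit.size_in_bytes() == 4 * len(emit.to_list())
    return emit.to_list()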
Tim Hall289a41d2020-08-04 21:40:14 +0100182Watermark = namedtuple("Watermark", ["npu", "dma"])
Tim Hall79d07d22020-04-27 18:20:16 +0100183
Tim Hall79d07d22020-04-27 18:20:16 +0100184
Tim Hall289a41d2020-08-04 21:40:14 +0100185def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
186 cmd = cmd_stream[cmd_index]
187 cmd_access = memory_accesses[cmd]
188 index = cmd_index - 1
Tim Hall79d07d22020-04-27 18:20:16 +0100189
Tim Hall289a41d2020-08-04 21:40:14 +0100190 # NPU dependency tracking
191 npu_outstanding = -1
192 npu_ops = 0
193 npu_index = watermark.npu
Tim Hall79d07d22020-04-27 18:20:16 +0100194
Tim Hall289a41d2020-08-04 21:40:14 +0100195 # DMA dependency tracking
196 dma_outstanding = -1
197 dma_ops = 0
198 dma_index = watermark.dma
Tim Hall79d07d22020-04-27 18:20:16 +0100199
Tim Hall289a41d2020-08-04 21:40:14 +0100200 # Seek back in the command stream looking for NPU or DMA dependencies
201 # but only as far as the first dependency or the watermarks (dependencies
202 # before this point have been satisfied already).
203 # The watermark moves to after the latest element we must wait for, not
204 # the command that issues the wait.
205 # NPU->NPU dependency is handled via blockdep.
206 while (index >= npu_index) or (index >= dma_index):
207 prev_cmd = cmd_stream[index]
208 prev_access = memory_accesses[prev_cmd]
Tim Hall79d07d22020-04-27 18:20:16 +0100209
Tim Hall289a41d2020-08-04 21:40:14 +0100210 # Check DMA consuming NPU output
211 if prev_cmd.cmdtype == CommandType.NpuStripe:
212 if index >= npu_index:
213 if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
214 npu_outstanding = npu_ops
215 npu_ops = npu_ops + 1 # Count NPU ops in the pipeline
216 if npu_ops >= arch.max_outstanding_kernels:
217 npu_index = max(index + 1, npu_index)
Tim Hall79d07d22020-04-27 18:20:16 +0100218
Tim Hall289a41d2020-08-04 21:40:14 +0100219 # Check NPU consuming DMA output
220 elif prev_cmd.cmdtype == CommandType.DMA:
221 if index >= dma_index:
222 if cmd.cmdtype == CommandType.NpuStripe:
223 if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
224 dma_outstanding = dma_ops
225 dma_ops = dma_ops + 1 # Count DMA ops in the pipeline
226 if dma_ops >= arch.max_outstanding_dma:
227 dma_index = max(index + 1, dma_index)
Tim Hall79d07d22020-04-27 18:20:16 +0100228
Tim Hall289a41d2020-08-04 21:40:14 +0100229 index = index - 1
Tim Hall79d07d22020-04-27 18:20:16 +0100230
Tim Hall289a41d2020-08-04 21:40:14 +0100231 # Update DMA watermark if we didn't see any and the NPU pipeline is full
232 if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
233 dma_index = cmd_index
234
235 # Bring the search watermark forwards, as these dependencies are now accounted for
236 watermark = Watermark(npu_index, dma_index)
237 outstanding = Watermark(npu_outstanding, dma_outstanding)
238
239 return watermark, outstanding
Tim Hall79d07d22020-04-27 18:20:16 +0100240
241
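# Illustrative sketch (not part of the original file): a toy run of get_cmd_wait_dependency() using
# stand-in command and memory-access objects. Only the attributes the function actually reads are
# mocked (cmdtype, conflicts(), max_outstanding_*); the stand-in classes and values are hypothetical.
def _demo_cmd_wait_dependency():
    from types import SimpleNamespace

    class _Access:
        def __init__(self, region):
            self.region = region

        def conflicts(self, other):
            return self.region == other.region

    arch = SimpleNamespace(max_outstanding_kernels=2, max_outstanding_dma=1)
    dma = SimpleNamespace(cmdtype=CommandType.DMA)
    npu = SimpleNamespace(cmdtype=CommandType.NpuStripe)
    cmd_stream = [dma, npu]
    accesses = {dma: _Access("weights"), npu: _Access("weights")}

    watermark, outstanding = get_cmd_wait_dependency(arch, cmd_stream, accesses, 1, Watermark(0, 0))
    # The NPU stripe reads what the DMA wrote, so it must wait until 0 DMA jobs remain outstanding
    assert outstanding.dma == 0 and outstanding.npu == -1
    return watermark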
242def get_op_kernel(ps):
243 if ps.primary_op is None:
244 return None
245
246 strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
247 dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
248 if ps.weight_tensor:
249 if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
250 k_h = 1
251 k_w = 1
252 else:
253 k_h = ps.weight_tensor.shape[0]
254 k_w = ps.weight_tensor.shape[1]
255 else:
256 k_h = ps.primary_op.attrs.get("filter_height", 1)
257 k_w = ps.primary_op.attrs.get("filter_width", 1)
258
259 return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
260
261
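# Illustrative sketch (not part of the original file): get_op_kernel() reads TensorFlow-style NHWC
# attributes, so strides[1]/dilation[1] are the y (height) values and strides[2]/dilation[2] the
# x (width) values, and the Kernel is built as (w, h, stride_x, stride_y, dilation_x, dilation_y).
# The pass-like object below is a stand-in carrying only the attributes the helper touches.
def _demo_get_op_kernel():
    from types import SimpleNamespace

    ps = SimpleNamespace(
        npu_block_type=NpuBlockType.ConvolutionMxN,
        primary_op=SimpleNamespace(attrs={"strides": (1, 2, 2, 1), "dilation": (1, 1, 1, 1)}),
        weight_tensor=SimpleNamespace(shape=[3, 3, 16, 32]),  # shape[0], shape[1] = kernel height, width
    )
    return get_op_kernel(ps)  # 3x3 kernel with stride 2 in both x and y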
Tim Hall79d07d22020-04-27 18:20:16 +0100262def has_prev_op_dependency(prev_cmd, cmd):
263 if prev_cmd is None:
264 return False
265 if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200266 if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor):
Tim Hall79d07d22020-04-27 18:20:16 +0100267 return True
Tim Hall90337952020-05-07 16:42:35 +0100268 elif cmd.ifm2_tensor is not None:
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200269 return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor)
Tim Hall79d07d22020-04-27 18:20:16 +0100270 return False
271
272
273def get_op_ofm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200274 start = full_shape(4, cmd.ofm_box.start_coord, 0)
275 end = full_shape(4, cmd.ofm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100276 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
277
278
279def get_op_ifm_rect(cmd):
Charles Xu3e9c4342020-04-22 08:31:43 +0200280 start = full_shape(4, cmd.ifm_box.start_coord, 0)
281 end = full_shape(4, cmd.ifm_box.end_coord, 1)
Tim Hall79d07d22020-04-27 18:20:16 +0100282 return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
283
284
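# Illustrative sketch (not part of the original file): the IFM/OFM boxes carry exclusive NHWC end
# coordinates, while the hardware Rect is inclusive, hence the "- 1" in the helpers above. The box
# object here is a stand-in with just the two attributes the helper reads.
def _demo_box_to_rect():
    from types import SimpleNamespace

    cmd = SimpleNamespace(ofm_box=SimpleNamespace(start_coord=[0, 0, 0, 0], end_coord=[1, 8, 8, 16]))
    return get_op_ofm_rect(cmd)  # covers x 0..7 (width), y 0..7 (height), z 0..15 (depth)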
285def get_op_ifmofm_block_depth(arch, cmd):
286 # Note: NOT equivalent to the normal ifm block depth calculation since
287 # it takes into account 'depthless' block operations by returning full
288 # depth
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200289 if cmd.ps.npu_block_type in (
290 NpuBlockType.ConvolutionDepthWise,
291 NpuBlockType.Pooling,
292 NpuBlockType.ElementWise,
293 NpuBlockType.ReduceSum,
294 ):
Tim Hall79d07d22020-04-27 18:20:16 +0100295 return cmd.ofm_box.get_size_shape()[-1]
296
297 return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
298
299
300def get_op_padding_lt(cmd):
301 if cmd.ps.npu_block_type not in (
302 NpuBlockType.ConvolutionDepthWise,
303 NpuBlockType.Pooling,
304 NpuBlockType.ConvolutionMxN,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200305 NpuBlockType.ReduceSum,
Tim Hall79d07d22020-04-27 18:20:16 +0100306 ):
307 return (0, 0)
308
309 explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
310
311 # Check if this is for horizontal ifm streaming
312 if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
313 explicit_padding[0] = cmd.pad_top
314 explicit_padding[2] = cmd.pad_bottom
315
316 return (explicit_padding[1], explicit_padding[0])
317
318
Jacob Bohline99b8932020-07-13 16:01:51 +0200319def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
320 if ifm_shape == []:
321 # Scalar needs to be in IFM2
322 return False
323 elif ifm2_shape == []:
324 return True
325
326 for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
327 if ifm != ifm2 and ifm == 1:
328 # Broadcasted FM needs to be in IFM2
329 return False
330
331 return True
332
333
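# Illustrative sketch (not part of the original file): for elementwise operations the scalar or the
# broadcast feature map has to end up as IFM2; when this helper returns False the caller swaps the
# operands and sets the ReverseOperandOrder bit (see the ElementWise branch further down).
def _demo_ifm_ifm2_order():
    assert ifm_ifm2_correct_order([1, 8, 8, 16], []) is True               # scalar already in IFM2
    assert ifm_ifm2_correct_order([], [1, 8, 8, 16]) is False              # scalar in IFM1 -> swap
    assert ifm_ifm2_correct_order([1, 1, 8, 16], [1, 8, 8, 16]) is False   # broadcast FM in IFM1 -> swap
    assert ifm_ifm2_correct_order([1, 8, 8, 16], [1, 1, 8, 16]) is True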
Tim Hall79d07d22020-04-27 18:20:16 +0100334def generate_register_command_stream(nng, sg, arch, verbose=False):
335 emit = CommandStreamEmitter()
336
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200337 if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
338 base_ptr_idx_map = {
339 MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
340 MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
341 MemType.Scratch: BasePointerIndex.ScratchTensor,
342 MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
343 }
344 else:
345 base_ptr_idx_map = {
346 MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
347 MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
348 MemType.Scratch: BasePointerIndex.ScratchTensor,
349 MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
350 }
Tim Hall79d07d22020-04-27 18:20:16 +0100351
352 # Maps an AccumulatorType enum to the corresponding acc_format value
353 acc_format_map = {
354 SHRAMElements.Acc16: acc_format.FP_S5_10.value,
355 SHRAMElements.Acc32: acc_format.INT_32BIT.value,
356 SHRAMElements.Acc40: acc_format.INT_40BIT.value,
357 }
358
359 # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
360 elementwise_mode_map = {
361 "MulAct": elementwise_mode.MUL.value,
362 "AddAct": elementwise_mode.ADD.value,
363 "SubAct": elementwise_mode.SUB.value,
364 "Minimum": elementwise_mode.MIN.value,
365 "Maximum": elementwise_mode.MAX.value,
366 "LeakyRelu": elementwise_mode.LRELU.value,
367 "Abs": elementwise_mode.ABS.value,
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200368 "CLZ": elementwise_mode.CLZ.value,
369 "SHR": elementwise_mode.SHR.value,
370 "SHL": elementwise_mode.SHL.value,
Tim Hall79d07d22020-04-27 18:20:16 +0100371 }
372
373 cmd_stream = []
Tim Hall289a41d2020-08-04 21:40:14 +0100374 memory_accesses = {}
Tim Hall79d07d22020-04-27 18:20:16 +0100375 for cmd in sg.high_level_command_stream:
376 if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
377 print("Warning: Skipping register command stream generation for", cmd.ps)
378 else:
379 cmd_stream.append(cmd)
Tim Hall289a41d2020-08-04 21:40:14 +0100380 memory_accesses[cmd] = cmd.get_memory_accesses()
Tim Hall79d07d22020-04-27 18:20:16 +0100381
Tim Hall289a41d2020-08-04 21:40:14 +0100382 def emit_cmd_waits(cmd_waits):
383 if cmd_waits.npu >= 0:
384 emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)
385
386 if cmd_waits.dma >= 0:
387 emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)
Tim Hall79d07d22020-04-27 18:20:16 +0100388
389 # Initialise operator dependency state
390 prev_ifm_rect = cur_ifm_rect = None
391 prev_ifm_block_depth = cur_ifm_block_depth = None
392 prev_ofm_rect = cur_ofm_rect = None
393 prev_ofm_block = cur_ofm_block = None
394 prev_kernel = cur_kernel = None
395 prev_cmd = None
396
Tim Hall42e41892020-07-06 10:51:31 +0100397 if arch.is_yoda_system:
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200398 emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
Tim Hallf7e810a2020-06-25 15:04:31 +0100399
Tim Hall289a41d2020-08-04 21:40:14 +0100400 dep_watermark = Watermark(0, 0)
401
402 for cmd_index, cmd in enumerate(cmd_stream):
403 dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark)
404
Tim Hall79d07d22020-04-27 18:20:16 +0100405 if cmd.cmdtype == CommandType.DMA:
406 start_coord = cmd.box.start_coord
407
408 src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
409 dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)
410
411 if cmd.in_tensor.compressed_values is not None:
412 stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
413 sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
414 else:
415 sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
416
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200417 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
Tim Hall79d07d22020-04-27 18:20:16 +0100418 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200419 if cmd.out_tensor.purpose == TensorPurpose.LUT:
420 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
421 else:
422 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200423
Tim Hall79d07d22020-04-27 18:20:16 +0100424 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
425 emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
426 dma_channel = 0
427 mode = 0 # From external to external
428
Tim Hall289a41d2020-08-04 21:40:14 +0100429 emit_cmd_waits(cmd_waits)
Tim Hall79d07d22020-04-27 18:20:16 +0100430 emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)
431
432 elif cmd.cmdtype == CommandType.NpuStripe:
433
434 ps = cmd.ps
435 primary_op = ps.primary_op
436 npu_block_type = ps.npu_block_type
437 # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
438 use_global_scale = False
439 # Specifies type of rounding to be used.
440 rounding_mode = rounding.TFL
Louis Verhaardb2fb2122020-06-04 15:51:24 +0200441 if primary_op.type == "ResizeBilinear":
Dwight Lidman3ec04ac2020-04-30 11:54:48 +0200442 rounding_mode = rounding.TRUNCATE
Tim Hall79d07d22020-04-27 18:20:16 +0100443 fmf = primary_op.attrs.get("fused_memory_function", None)
444 faf = primary_op.attrs.get("fused_activation_function", None)
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200445 fused_quantize = any(op.type == "Quantize" for op in ps.ops)
Tim Hall79d07d22020-04-27 18:20:16 +0100446
447 # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
448 op_to_scale = 0
449
450 # Update state history
451 prev_ifm_rect = cur_ifm_rect
452 prev_ifm_block_depth = cur_ifm_block_depth
453 prev_ofm_rect = cur_ofm_rect
454 prev_ofm_block = cur_ofm_block
455 prev_kernel = cur_kernel
Louis Verhaardb2fb2122020-06-04 15:51:24 +0200456 cur_kernel = get_op_kernel(ps)
Tim Hall79d07d22020-04-27 18:20:16 +0100457
458 block_config = ps.block_config
459 emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
460 emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
461 emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)
462
463 shared_buffer = ps.shared_buffer
464
465 if npu_block_type == NpuBlockType.ElementWise:
Jacob Bohlinbe733cf2020-08-13 10:21:34 +0200466 ifm2_broadcast = 0
Tim Hall79d07d22020-04-27 18:20:16 +0100467
Jacob Bohlinbf612682020-08-13 09:37:02 +0200468 if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
Tim Hall79d07d22020-04-27 18:20:16 +0100469 # The scalar has to be the ifm2 tensor so switch the ifms
470 cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
471 cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
472
473 # Set ReverseOperandOrder bit to IFM2_BROADCAST
474 ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
475
476 # Calculate scales needed for arithmetic elementwise operators
477 if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
478 input_scale = cmd.ifm_tensor.quantization.scale_f32
479 input2_scale = cmd.ifm2_tensor.quantization.scale_f32
480 output_scale = cmd.ofm_tensor.quantization.scale_f32
481 use_global_scale = True
482
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200483 if output_scale is not None and faf in ("Sigmoid", "Tanh"):
484 output_scale = 1 / 0x3000
Tim Hall79d07d22020-04-27 18:20:16 +0100485
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200486 if primary_op.type == "MulAct":
487 if None in (input_scale, input2_scale, output_scale):
488 ofm_scale = 1
489 shift = 0
490 else:
491 ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
Tim Hall79d07d22020-04-27 18:20:16 +0100492 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
493 else: # AddAct/SubAct
Charles Xu9a03fdf2020-07-02 15:12:40 +0200494 # Force the output scale to be the same as the input scale for
 495 # ResizeBilinear 1x1 that is converted to an add
496 if "resizebilinear" in primary_op.attrs:
497 output_scale = input2_scale
498
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200499 if None in (input_scale, input2_scale, output_scale):
500 opa_scale = opb_scale = ofm_scale = 1
501 opa_shift = shift = 0
502 elif input_scale == input2_scale:
Tim Hall79d07d22020-04-27 18:20:16 +0100503 opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
504 input_scale, input2_scale, output_scale
505 )
506 opa_shift = 0 # Unused for this case
507 else:
508 # Use advanced implementation only when input scales differ
509 bitdepth = cmd.ifm_tensor.dtype.bits
510 (
511 opa_scale,
512 opa_shift,
513 ofm_scale,
514 shift,
515 op_to_scale,
516 ) = scaling.advanced_elementwise_add_sub_scale(
517 input_scale, input2_scale, output_scale, bitdepth
518 )
519 opb_scale = 0 # Unused for this case
520 if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
521 # If the operand order is reversed we also have to swap which operand is scaled
522 if op_to_scale == scaling.OperandToScale.OPa:
523 op_to_scale = scaling.OperandToScale.OPb
524 else:
525 op_to_scale = scaling.OperandToScale.OPa
526
527 emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
528 emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
529 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
530
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200531 elif primary_op.type in set(("LeakyRelu", "Abs",)):
Tim Hall79d07d22020-04-27 18:20:16 +0100532 output_scale = cmd.ofm_tensor.quantization.scale_f32
533 use_global_scale = True
534
535 if primary_op.type == "LeakyRelu":
536 output_scale *= primary_op.attrs["alpha"]
537
538 ofm_scale, shift = scaling.quantise_scale(output_scale)
539 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200540 else:
541 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
Tim Hall79d07d22020-04-27 18:20:16 +0100542
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200543 # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
544 uses_lut = primary_op.activation_lut is not None
545 shram_required = arch.available_shram_banks(uses_lut)
Tim Hall79d07d22020-04-27 18:20:16 +0100546 emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
547
548 # Acc buffers not needed so set AB_START to size of SHRAM
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200549 emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
Tim Hall79d07d22020-04-27 18:20:16 +0100550
551 # Is not a unary operator
552 if cmd.ifm2_tensor is not None:
553 if cmd.ifm2_tensor.shape == []:
554 # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
555 ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
556 else:
557 ifm_box_shape = cmd.ifm_box.get_size_shape()
558 ifm2_box_shape = cmd.ifm2_box.get_size_shape()
559
560 if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
561 # Broadcast in 'H' dimension
562 assert cmd.ifm2_tensor.shape[1] == 1
563 ifm2_broadcast |= IFM2Broadcast.BroadcastHdim
564
565 if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
566 # Broadcast in 'W' dimension
567 assert cmd.ifm2_tensor.shape[2] == 1
568 ifm2_broadcast |= IFM2Broadcast.BroadcastWdim
569
570 if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
571 # Broadcast in 'C' dimension
572 assert cmd.ifm2_tensor.shape[3] == 1
573 ifm2_broadcast |= IFM2Broadcast.BroadcastCdim
574
575 # Set IFM2_IB_START to the latter half of the IB space
576 ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
577 emit.cmd0_with_param(
578 cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
579 )
580
581 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
582
583 else:
584 emit.cmd0_with_param(
585 cmd0.NPU_SET_IFM_IB_END,
586 shared_buffer.bank_locations[SharedBufferArea.IFM]
587 + shared_buffer.banks_required[SharedBufferArea.IFM],
588 )
589 emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
590
591 emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
592
Louis Verhaardb2fb2122020-06-04 15:51:24 +0200593 if primary_op.type == "ResizeBilinear":
Dwight Lidman3ec04ac2020-04-30 11:54:48 +0200594 # perform nearest neighbor upscale
Jacob Bohlincf7da102020-05-20 09:03:40 +0200595 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
596 elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
597 # perform insert zero upscale
598 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
Dwight Lidman3ec04ac2020-04-30 11:54:48 +0200599 else:
Jacob Bohlincf7da102020-05-20 09:03:40 +0200600 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)
Tim Hall79d07d22020-04-27 18:20:16 +0100601
602 if npu_block_type in set(
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200603 (
604 NpuBlockType.ConvolutionMxN,
605 NpuBlockType.ConvolutionDepthWise,
606 NpuBlockType.Pooling,
607 NpuBlockType.ReduceSum,
608 )
Tim Hall79d07d22020-04-27 18:20:16 +0100609 ):
610 # Set up padding
611 explicit_padding = list(primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
612
613 # Check if this is for horizontal ifm streaming
614 if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
615 explicit_padding[0] = cmd.pad_top
616 explicit_padding[2] = cmd.pad_bottom
617
618 # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
619 # because an activation function needed to be fused.
620 if cmd.ifm_box.start_coord[-2] > 0:
621 explicit_padding[1] = 0
622 if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
623 explicit_padding[3] = 0
Tim Hall79d07d22020-04-27 18:20:16 +0100624 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
625 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
626 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
627 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])
628
Dwight Lidman0538a772020-05-06 14:09:17 +0200629 # set kernel x stride low bit
630 stride = primary_op.attrs["strides"][2] - 1 & 1
631 # set kernel y stride low bit
632 stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
633 # set kernel x stride extension bits
634 stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
635 # set kernel y stride extension bits
636 stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
637
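                # Worked example (annotation, not part of the original file): with strides = (1, 2, 2, 1),
                # i.e. stride_y = strides[1] = 2 and stride_x = strides[2] = 2, the packing above gives
                #   bit 0: (2 - 1) & 1 = 1            (x stride low bit)
                #   bit 1: ((2 - 1) & 1) << 1 = 2     (y stride low bit)
                #   bits 6+ and 9+: (2 - 1) >> 1 = 0  (no extension bits needed for strides up to 2)
                # so stride == 0b11 == 3 before the dilation and part-kernel-first bits are ORed in below.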
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200638 if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
Tim Hall79d07d22020-04-27 18:20:16 +0100639 k_height, k_width = primary_op.attrs["ksize"][1:3]
640 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
641 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
642
643 valid_padding = sum(explicit_padding) == 0
644
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200645 if (
646 primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear", "ReduceSum"))
647 and valid_padding
648 ):
Tim Hall79d07d22020-04-27 18:20:16 +0100649 # For valid padding vela has to output scaling values
650 if faf == "Sigmoid" or faf == "Tanh":
651 rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
Tim Hall79d07d22020-04-27 18:20:16 +0100652
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200653 if cmd.ifm_tensor.dtype == DataType.int16:
Charles Xu749d9212020-06-11 12:39:19 +0200654 multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200655 rescale *= 3 * multiplier
656
657 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
Tim Hall79d07d22020-04-27 18:20:16 +0100658 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200659
660 if cmd.ifm_tensor.dtype == DataType.int16:
661 scale = (1 << shift) * 3 * multiplier
662 else:
663 scale = int(round_away_zero(scale * rescale))
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200664 elif fused_quantize:
665 # Quantize op requires different scaling
666 ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
667 ofm_scale_f64 = np.double(cmd.ofm_tensor.quantization.scale_f32)
668 scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
Tim Hall79d07d22020-04-27 18:20:16 +0100669 else:
670 # In case avg pool fused with concat or other memory operation, rescaling might be needed.
671 # k_height == k_width == 1 is always true in this case
672 # Normally the scale is maximised, to get maximum precision, which means that
673 # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200674 if None not in (
675 cmd.ofm_tensor.quantization.scale_f32,
676 cmd.ifm_tensor.quantization.scale_f32,
677 ):
678 rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
679 rescale_bits = 0
680 if k_height == k_width == 1:
681 if fmf == "ConcatSliceWrite":
682 rounding_mode = rounding.NATURAL
683 if rescale > 1:
684 rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
685 elif rescale < 1:
686 rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
687 scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
688 scale = int(round_away_zero(scale * rescale))
689 else:
690 scale = 1
691 shift = 0
Tim Hall79d07d22020-04-27 18:20:16 +0100692
693 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
694 # Valid-padded average pool should use the global scale from
695 # NPU_SET_OFM_SCALE register, which is set above.
696 use_global_scale = True
697
698 else: # Convolution
699 assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
Fredrik Svedbergd67c0aa2020-03-30 13:15:28 +0200700 # Reduced precision quantization and natural rounding used for int16
701 if cmd.ifm_tensor.dtype == DataType.int16:
702 rounding_mode = rounding.NATURAL
Louis Verhaardb2fb2122020-06-04 15:51:24 +0200703 stride |= (cur_kernel.dilation.y - 1) << 4
704 stride |= (cur_kernel.dilation.x - 1) << 3
705 emit.cmd0_with_param(
706 cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
707 )
708 emit.cmd0_with_param(
709 cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
710 )
Tim Hall79d07d22020-04-27 18:20:16 +0100711 if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
712 # Part-kernel-first weight ordering
713 assert npu_block_type == NpuBlockType.ConvolutionMxN
714 stride |= 1 << 2
715
716 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
717
718 elif npu_block_type in set((NpuBlockType.VectorProduct,)):
719 # Vector product is implemented using a 1x1 convolution so need
720 # to setup the appropriate padding and kernel info
721 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
722 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
723 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
724 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)
725
726 # kernel stride reg = 0 means stride(1,1) + depth first weight
727 # order + dilation(0,0) + kernel_split_size=8
728 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)
729
730 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
731 emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
732
733 if npu_block_type in set(
734 (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
735 ):
736 # Emit Weight base address commands, only maps the area required for
737 # this command's weights from the larger tensor.
738 stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
Tim Hallf7e810a2020-06-25 15:04:31 +0100739 weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200740 substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length
Tim Hallf7e810a2020-06-25 15:04:31 +0100741
742 # Extract weight substream offsets and calculate their lengths
743 assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
Tim Hall79d07d22020-04-27 18:20:16 +0100744 weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
Tim Hallf7e810a2020-06-25 15:04:31 +0100745
Tim Hall62316762020-06-25 16:55:02 +0100746 # Set weights sources for active and present cores
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200747 for core, param in enumerate(
748 [
749 (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
750 (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
751 ]
752 ):
Tim Hall62316762020-06-25 16:55:02 +0100753 if core < substreams:
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200754 emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
755 emit.cmd1_with_offset(
756 param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
757 )
Tim Hall62316762020-06-25 16:55:02 +0100758 elif core < arch.ncores:
759 emit.cmd1_with_offset(param[0], weight_addr)
760 emit.cmd1_with_offset(param[1], 0)
Tim Hallf7e810a2020-06-25 15:04:31 +0100761
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200762 weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
Tim Hall79d07d22020-04-27 18:20:16 +0100763 emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
Tim Hall79d07d22020-04-27 18:20:16 +0100764
765 # Emit Scale & Bias base address commands, with length matching the amount required by
766 # the weight tensors.
767 if cmd.scale_tensor is not None:
Tim Hallf7e810a2020-06-25 15:04:31 +0100768 scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200769 substreams = len(scale_substream_offsets) - 1 # Offset list must terminate with full stream length
Tim Hallf7e810a2020-06-25 15:04:31 +0100770
771 # Extract scale substream offsets and calculate their lengths
772 assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200773 scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
Tim Hallf7e810a2020-06-25 15:04:31 +0100774
Tim Hall62316762020-06-25 16:55:02 +0100775 # Set scale sources for active and present cores
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200776 for core, param in enumerate(
777 [
778 (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
779 (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
780 ]
781 ):
Tim Hall62316762020-06-25 16:55:02 +0100782 if core < substreams:
Jacob Bohlin0b9ca782020-07-09 11:16:30 +0200783 emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
784 emit.cmd1_with_offset(
785 param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
786 )
Tim Hall62316762020-06-25 16:55:02 +0100787 elif core < arch.ncores:
788 emit.cmd1_with_offset(param[0], scale_addr)
789 emit.cmd1_with_offset(param[1], 0)
Tim Hallf7e810a2020-06-25 15:04:31 +0100790
Tim Hall79d07d22020-04-27 18:20:16 +0100791 # Emit base address for NPU to access scale & bias data
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200792 scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
Tim Hall79d07d22020-04-27 18:20:16 +0100793 emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
Tim Hall79d07d22020-04-27 18:20:16 +0100794
795 ofm_quant = cmd.ofm_tensor.quantization
796 ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
797 ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
798 ifm_min = cmd.ifm_tensor.quantization.min
799 ifm_max = cmd.ifm_tensor.quantization.max
800
801 # Emit commands for any fused activation function
Diego Russoea6111a2020-04-14 18:41:58 +0100802 if faf is None:
Tim Hall79d07d22020-04-27 18:20:16 +0100803 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
804 # Even if no activation function, values need to be set to override previous values
805 faf_min = ofm_quant_qmin
806 faf_max = ofm_quant_qmax
807 elif faf == "Relu":
808 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
809 faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
810 faf_max = ofm_quant_qmax
811 elif faf == "Relu6":
812 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
813 faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
814 faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
815 elif faf == "ReluN1To1":
816 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
817 faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
818 faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
819 elif faf == "Tanh":
820 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200821 if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
822 faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
823 faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
824 else:
825 faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
826 faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
Tim Hall79d07d22020-04-27 18:20:16 +0100827 elif faf == "Sigmoid":
828 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
Fredrik Svedberg620d88c2020-05-19 10:43:01 +0200829 if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
830 faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
831 faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
832 else:
833 faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
834 faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200835 elif faf == "LUT":
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200836 lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
837 assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200838 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
839 faf_min = ofm_quant_qmin
840 faf_max = ofm_quant_qmax
Tim Hall79d07d22020-04-27 18:20:16 +0100841 else:
842 raise Exception("Unsupported fused_activation_function = " + faf)
843
844 # Activation range needs to be set based upon the quantisation range and the fused activation range
845 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
846 emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))
847
848 out_shape = cmd.ofm_box.get_size_shape()
849 if len(out_shape) >= 4:
850 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
851 else:
852 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
853 if len(out_shape) >= 2:
854 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
855 else:
856 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
857 emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)
858
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200859 if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
Tim Hall79d07d22020-04-27 18:20:16 +0100860 in_shape = cmd.ifm_box.get_size_shape()
861 emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
862 else:
863 emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)
864
Jacob Bohlin3c678292020-04-27 10:27:25 +0200865 for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
Tim Hall79d07d22020-04-27 18:20:16 +0100866 (
867 cmd.ifm_tensor,
868 cmd.ifm_box,
Jacob Bohlin3c678292020-04-27 10:27:25 +0200869 cmd0.NPU_SET_IFM_REGION,
Tim Hall79d07d22020-04-27 18:20:16 +0100870 (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
871 (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
872 cmd0.NPU_SET_IFM_ZERO_POINT,
873 ),
874 (
875 cmd.ifm2_tensor,
876 cmd.ifm2_box,
Jacob Bohlin3c678292020-04-27 10:27:25 +0200877 cmd0.NPU_SET_IFM2_REGION,
Tim Hall79d07d22020-04-27 18:20:16 +0100878 (
879 cmd1.NPU_SET_IFM2_BASE0,
880 cmd1.NPU_SET_IFM2_BASE1,
881 cmd1.NPU_SET_IFM2_BASE2,
882 cmd1.NPU_SET_IFM2_BASE3,
883 ),
884 (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
885 cmd0.NPU_SET_IFM2_ZERO_POINT,
886 ),
887 (
888 cmd.ofm_tensor,
889 cmd.ofm_box,
Jacob Bohlin3c678292020-04-27 10:27:25 +0200890 cmd0.NPU_SET_OFM_REGION,
Tim Hall79d07d22020-04-27 18:20:16 +0100891 (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
892 (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
893 cmd0.NPU_SET_OFM_ZERO_POINT,
894 ),
895 ):
896
Diego Russoea6111a2020-04-14 18:41:58 +0100897 if tens is None:
Tim Hall79d07d22020-04-27 18:20:16 +0100898 continue
899
Jacob Bohlin9fbc4912020-06-29 11:58:50 +0200900 need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite") or fused_quantize
Tim Hall79d07d22020-04-27 18:20:16 +0100901 if (
Dwight Lidman86d49932020-06-04 15:31:56 +0200902 primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
Diego Russoea6111a2020-04-14 18:41:58 +0100903 ) or tens.quantization is None:
Tim Hall79d07d22020-04-27 18:20:16 +0100904 # Actual integer operation, just set scale to 1 and zero point to 0
905 emit.cmd0_with_param(zero_point_op, 0)
906 else:
907 assert tens.quantization.zero_point is not None, "need an actual zero point set"
Charles Xu9a03fdf2020-07-02 15:12:40 +0200908 if (
909 "resizebilinear" in primary_op.attrs
910 and primary_op.type == "AddAct"
911 and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
912 ):
913 # Force the output zero point to be the same as the input zero point
914 # for ResizeBilinear 1x1 that is converted to an add
915 zero_point = cmd.ifm2_tensor.quantization.zero_point
916 else:
917 zero_point = tens.quantization.zero_point
918 emit.cmd0_with_param(zero_point_op, int(zero_point))
Tim Hall79d07d22020-04-27 18:20:16 +0100919
920 if tens.shape == []:
921 # Empty shape, elementwise constant
Louis Verhaardc88a96f2020-06-10 09:04:33 +0200922 ifm2_scalar = tens.quant_values
Tim Hall79d07d22020-04-27 18:20:16 +0100923 assert ifm2_scalar.size == 1
Louis Verhaardc88a96f2020-06-10 09:04:33 +0200924 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
Tim Hall79d07d22020-04-27 18:20:16 +0100925 continue
926
927 height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
928 box.start_coord, box.end_coord
929 )
930 if npu_block_type != NpuBlockType.VectorProduct:
931 if tens == cmd.ifm_tensor:
932 emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
933 emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
934 emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
935 elif tens == cmd.ofm_tensor:
936 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
937 emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
938 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
Louis Verhaard0cf06c72020-05-12 08:31:05 +0200939 if tens == cmd.ifm2_tensor:
Tim Hall79d07d22020-04-27 18:20:16 +0100940 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
941 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
942 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
943 else:
944 if len(out_shape) == 2:
945 # TODO: N is put in W-dimension for now
946 # Should be spread over H and W, but then block size selection,
947 # and stride calculation should be changed
948 if tens == cmd.ifm_tensor:
949 emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
950 elif tens == cmd.ofm_tensor:
951 emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
952 else:
953 assert False
954
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200955 emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
Jacob Bohlin3c678292020-04-27 10:27:25 +0200956
Tim Hall79d07d22020-04-27 18:20:16 +0100957 for idx, addr in enumerate(addresses):
958 if addr is None:
959 addresses[idx] = 0
960
961 emit.cmd1_with_offset(ptr_ops[0], addresses[0])
962 emit.cmd1_with_offset(ptr_ops[1], addresses[1])
963 emit.cmd1_with_offset(ptr_ops[2], addresses[2])
964 emit.cmd1_with_offset(ptr_ops[3], addresses[3])
965
966 strides = tens.get_strides()
967 emit.cmd1_with_offset(stride_ops[0], strides[1]) # stride between 16-byte channel blocks (C)
968 emit.cmd1_with_offset(stride_ops[2], strides[3]) # stride between horizontal values (W)
969 emit.cmd1_with_offset(stride_ops[1], strides[2]) # stride between vertical values (H)
970
971 if tens.format == TensorFormat.NHCWB16:
972 # Check that all BasePointer addresses are aligned to 16 bytes
973 assert (int(addresses[0]) % 16) == 0
974 assert (int(addresses[1]) % 16) == 0
975 assert (int(addresses[2]) % 16) == 0
976 assert (int(addresses[3]) % 16) == 0
977
978 ofm_dtype = cmd.ofm_tensor.dtype
979 assert ofm_dtype.type & BaseType.Int
980 prec = 0
981 if ofm_dtype.size_in_bits() == 8:
982 prec = 0
983 elif ofm_dtype.size_in_bits() == 16:
984 prec = 2
Fredrik Svedberga0c36242020-06-03 15:43:31 +0200985 elif ofm_dtype.size_in_bits() == 32:
986 prec = 4
Tim Hall79d07d22020-04-27 18:20:16 +0100987 else:
988 assert 0
989
990 if ofm_dtype.type & BaseType.Signed:
991 prec += 1
992
993 if use_global_scale:
994 # Set global scale bit, as opposed to using per channel scale
995 prec |= 1 << 8
996
997 if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
998 prec |= 1 << 6
999
1000 prec |= rounding_mode.value << 14
1001
1002 emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
1003
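            # Worked example (annotation, not part of the original file): a signed 8-bit OFM written in
            # NHCWB16 format with a global scale gives prec = 0 + 1 (signed) | 1 << 8 | 1 << 6 = 0x141,
            # with the rounding mode then occupying bits 14-15.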
1004 prec = None
1005 weight_bits = 8
1006 if cmd.weight_tensor is not None:
1007 weight_bits = cmd.weight_tensor.dtype.size_in_bits()
1008
1009 ifm_dtype = cmd.ifm_tensor.dtype
1010
1011 assert weight_bits == 8, "Unsupported weight bit depth"
Fredrik Svedberga0c36242020-06-03 15:43:31 +02001012 assert (
1013 ifm_dtype.size_in_bits() in {8, 16}
1014 or ifm_dtype.size_in_bits() == 32
1015 and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
1016 ), "Unsupported ifm bit depth"
Tim Hall79d07d22020-04-27 18:20:16 +01001017
1018 if ifm_dtype.size_in_bits() == 8:
1019 if ifm_dtype.type & BaseType.Signed:
Diqing Zhongfed918b2020-04-27 10:27:34 +02001020 prec = ifm_precision.S8
Tim Hall79d07d22020-04-27 18:20:16 +01001021 else:
Diqing Zhongfed918b2020-04-27 10:27:34 +02001022 prec = ifm_precision.U8
Tim Hall79d07d22020-04-27 18:20:16 +01001023 elif ifm_dtype.size_in_bits() == 16:
1024 if ifm_dtype.type & BaseType.Signed:
Diqing Zhongfed918b2020-04-27 10:27:34 +02001025 prec = ifm_precision.S16
Tim Hall79d07d22020-04-27 18:20:16 +01001026 else:
Diqing Zhongfed918b2020-04-27 10:27:34 +02001027 prec = ifm_precision.U16
Fredrik Svedberga0c36242020-06-03 15:43:31 +02001028 elif ifm_dtype == DataType.int32:
1029 prec = ifm_precision.S32
Tim Hall79d07d22020-04-27 18:20:16 +01001030
1031 ifm_prec = prec.value
1032 ifm2_prec = ifm_prec
1033
1034 if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
1035 ifm_prec |= 1 << 6
1036
1037 ifm_prec |= op_to_scale << 8
1038
1039 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)
1040
1041 if cmd.ifm2_tensor is not None:
1042 if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
1043 ifm2_prec |= 1 << 6
1044 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
1045
Tim Hall79d07d22020-04-27 18:20:16 +01001046 # Get op parameters
1047 cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
1048 cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
1049 cur_ofm_rect = get_op_ofm_rect(cmd)
1050 cur_ifm_rect = get_op_ifm_rect(cmd)
Tim Hall79d07d22020-04-27 18:20:16 +01001051 cur_padLT = get_op_padding_lt(cmd)
1052 if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
1053 if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
1054 blockdep = arch.calc_block_dep(
1055 prev_ifm_rect,
1056 prev_ofm_rect,
1057 prev_ifm_block_depth,
1058 prev_ofm_block,
1059 prev_kernel,
1060 cur_ifm_rect,
1061 cur_ofm_rect,
1062 cur_ifm_block_depth,
1063 cur_ofm_block,
1064 cur_kernel,
1065 cur_padLT,
1066 )
1067 else:
1068 blockdep = 0
1069 else:
1070 blockdep = ArchitectureFeatures.MAX_BLOCKDEP
1071
1072 # Set between every op (dependent or not)
1073 blockdep = min(blockdep, arch.max_blockdep)
1074 emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
1075 prev_cmd = cmd
1076
Tim Hall289a41d2020-08-04 21:40:14 +01001077 emit_cmd_waits(cmd_waits)
1078
Tim Hall79d07d22020-04-27 18:20:16 +01001079 if npu_block_type == NpuBlockType.ConvolutionMxN:
1080 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
1081 elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
1082 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
1083 elif npu_block_type == NpuBlockType.VectorProduct:
1084 # Vector product is implemented using a 1x1 convolution
1085 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
1086 elif npu_block_type == NpuBlockType.Pooling:
Fredrik Svedberga0c36242020-06-03 15:43:31 +02001087 param = pooling_mode.MAX.value if "Max" in primary_op.type else pooling_mode.AVERAGE.value
Tim Hall79d07d22020-04-27 18:20:16 +01001088 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
Fredrik Svedberga0c36242020-06-03 15:43:31 +02001089 elif npu_block_type == NpuBlockType.ReduceSum:
1090 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
Tim Hall79d07d22020-04-27 18:20:16 +01001091 elif npu_block_type == NpuBlockType.ElementWise:
1092 param = elementwise_mode_map[primary_op.type]
1093 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
1094 else:
1095 print("Warning: Skipping register command stream generation for", ps)
1096
1097 # Fill in final part of command stream:
1098 emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
1099
1100 sg.register_command_stream = emit.to_list()
1101 if verbose:
1102 emit.print_cmds()
1103 print("number of commands", len(emit.cmd_stream))
1104 print("command stream length in words", len(sg.register_command_stream))