# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
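#
# A minimal usage sketch (illustrative only; it assumes earlier compiler passes have already
# produced the network nng, a scheduled subgraph sg and an ArchitectureFeatures instance arch):
#
#     from .register_command_stream_generator import generate_register_command_stream
#
#     generate_register_command_stream(nng, sg, arch, verbose=False)
#     command_words = sg.register_command_stream  # flat list of 32-bit command words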
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

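    # Note: each command word emitted below is 32 bits, with the command code (including the
    # CmdMode payload-mode bits) in bits [15:0] and a 16-bit parameter in bits [31:16].
    # cmd1 commands are followed by one additional 32-bit payload word holding the offset.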
    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


def calc_command_dependencies(cmd_stream, arch):
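    # Returns a dict mapping each command to (absolute start position, per-command-type relative
    # wait offsets); a relative offset of None means there is no dependency of that kind.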
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])), dtype=object)

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])), dtype=object)
        cmd_ends[cmd] = np.array((pos[0], pos[1]), dtype=object)
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])), dtype=object)
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses) or (
                            prev_cmd.cmdtype == CommandType.DMA and prev_cmd.in_tensor.purpose == TensorPurpose.LUT
                        ):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                # Check if we've got dependencies for all commands, in which case we can early out
                for dep in dep_cmds:
                    if dep is None:
                        break
                else:
                    break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ElementWise,
        NpuBlockType.ReduceSum,
    ):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ReduceSum,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    elif ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False

    return True

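# Illustrative example of the ordering rule above: ifm_ifm2_correct_order([1, 4, 4, 8], [1, 1, 1, 8])
# returns True (the broadcast feature map is already IFM2), while swapping the two arguments returns
# False, which makes the elementwise code below exchange the IFM/IFM2 tensors and boxes.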

def generate_register_command_stream(nng, sg, arch, verbose=False):
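    # Walks the scheduled high-level command stream of subgraph sg, emits the register settings
    # and NPU operation commands for each high-level command, and stores the resulting 32-bit
    # words in sg.register_command_stream.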
    emit = CommandStreamEmitter()

    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
        "CLZ": elementwise_mode.CLZ.value,
        "SHR": elementwise_mode.SHR.value,
        "SHL": elementwise_mode.SHL.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            # TODO This can be optimized for yoda
            param = 0
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            if cmd.out_tensor.purpose == TensorPurpose.LUT:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)
            fused_quantize = any(op.type == "Quantize" for op in ps.ops)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = get_op_kernel(ps)

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = (
                    IFM2Broadcast.ReverseOperandOrder if primary_op.attrs.get("reverse_op_order", False) else 0
                )

                if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if output_scale is not None and faf in ("Sigmoid", "Tanh"):
                        output_scale = 1 / 0x3000

                    if primary_op.type == "MulAct":
                        if None in (input_scale, input2_scale, output_scale):
                            ofm_scale = 1
                            shift = 0
                        else:
                            ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        # Force output scale same as the input scale for
                        # resizebilinear 1x1 that is converted to add
                        if "resizebilinear" in primary_op.attrs:
                            output_scale = input2_scale

                        if None in (input_scale, input2_scale, output_scale):
                            opa_scale = opb_scale = ofm_scale = 1
                            opa_shift = shift = 0
                        elif input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                elif primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                else:
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                        # Set IFM2_IB_START to the latter half of the IB space
                        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                        emit.cmd0_with_param(
                            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                        )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (
                    NpuBlockType.ConvolutionMxN,
                    NpuBlockType.ConvolutionDepthWise,
                    NpuBlockType.Pooling,
                    NpuBlockType.ReduceSum,
                )
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                # because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
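                # Worked example (illustrative): a stride of 2 in both x and y gives
                # stride = 0b11 here, i.e. both low bits set and no extension bits.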

                if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if (
                        primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear", "ReduceSum"))
                        and valid_padding
                    ):
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        elif fused_quantize:
                            # Quantize op requires different scaling
                            ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
                            ofm_scale_f64 = np.double(cmd.ofm_tensor.quantization.scale_f32)
                            scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
                        else:
                            # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                            # k_height == k_width == 1 is always true in this case
                            # Normally the scale is maximised, to get maximum precision, which means that
                            # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
                            if None not in (
                                cmd.ofm_tensor.quantization.scale_f32,
                                cmd.ifm_tensor.quantization.scale_f32,
                            ):
                                rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                                rescale_bits = 0
                                if k_height == k_width == 1:
                                    if fmf == "ConcatSliceWrite":
                                        rounding_mode = rounding.NATURAL
                                    if rescale > 1:
                                        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                    elif rescale < 1:
                                        rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                                scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                                scale = int(round_away_zero(scale * rescale))
                            else:
                                scale = 1
                                shift = 0

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so need
                # to setup the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                # Extract weight substream offsets and calculate their lengths
                assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                # Set weights sources for active and present cores
                for core, param in enumerate(
                    [
                        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                    ]
                ):
                    if core < substreams:
                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                        emit.cmd1_with_offset(
                            param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
                        )
                    elif core < arch.ncores:
                        emit.cmd1_with_offset(param[0], weight_addr)
                        emit.cmd1_with_offset(param[1], 0)

                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                    # Extract scale substream offsets and calculate their lengths
                    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                    # Set scale sources for active and present cores
                    for core, param in enumerate(
                        [
                            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                        ]
                    ):
                        if core < substreams:
                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                            emit.cmd1_with_offset(
                                param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
                            )
                        elif core < arch.ncores:
                            emit.cmd1_with_offset(param[0], scale_addr)
                            emit.cmd1_with_offset(param[1], 0)

                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "LUT":
                lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", 0)
                assert lut_index <= activation.LUT_END.value, "LUT index out of range."
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite") or fused_quantize
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    if (
                        "resizebilinear" in primary_op.attrs
                        and primary_op.type == "AddAct"
                        and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
                    ):
                        # Force output zero point same as the input zero point
                        # for resizebilinear 1x1 that is converted to add
                        zero_point = cmd.ifm2_tensor.quantization.zero_point
                    else:
                        zero_point = tens.quantization.zero_point
                    emit.cmd0_with_param(zero_point_op, int(zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in W-dimension for now
                        # Should be spread over H and W, but then block size selection,
                        # and stride calculation should be changed
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            elif ofm_dtype.size_in_bits() == 32:
                prec = 4
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert (
                ifm_dtype.size_in_bits() in {8, 16}
                or ifm_dtype.size_in_bits() == 32
                and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
            ), "Unsupported ifm bit depth"

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16
            elif ifm_dtype == DataType.int32:
                prec = ifm_precision.S32

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
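            # Determine the block dependency (roughly, how far this op may overlap with the
            # previous op); MAX_BLOCKDEP is used when the two ops are independent.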
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = pooling_mode.MAX.value if "Max" in primary_op.type else pooling_mode.AVERAGE.value
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ReduceSum:
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))