# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
# all the register settings. Calculates dependencies between commands and inserts wait operations. It also generates a
# bit stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat


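# Caches the last value written to each register so that redundant register writes can be
# dropped from the command stream; set_register() returns True only when the value changes.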
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


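# Command words are 32 bits: bits 0-9 hold the command opcode, bits 14-15 the payload mode and
# bits 16-31 a 16-bit parameter. Commands in Payload32 mode are followed by one extra 32-bit
# payload word (see cmd1_with_offset below).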
class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

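    # Commands whose name contains "DMA" are tracked in a separate RegisterMachine from all
    # other commands, so the two groups do not invalidate each other's cached register values.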
    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


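# For every command, find the most recent preceding command of each command type it depends on,
# either because their memory accesses conflict or, for NPU stripe to NPU stripe, because the
# shared buffer allocations are incompatible. The result maps each command to its start position
# plus a list of relative offsets that emit_wait_commands() later turns into wait commands.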
def calc_command_dependencies(cmd_stream, arch):
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])))

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
        cmd_ends[cmd] = np.array((pos[0], pos[1]))
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                        # Check if we've got dependencies for all commands, in which case we can early out
                        for dep in dep_cmds:
                            if dep is None:
                                break
                        else:
                            break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


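# True if cmd consumes the previous stripe's OFM (matching tensor equivalence ids), in which case
# the block dependency distance between the two operations has to be calculated.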
def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


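# Returns the (left, top) explicit padding of the stripe, used as input to the block dependency
# calculation.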
def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


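# Main entry point: walks the subgraph's high-level command stream and, for every DMA and NPU
# stripe command, emits the register settings followed by the operation itself, inserting wait
# commands according to the calculated dependencies. The result is stored on
# sg.register_command_stream as a flat list of 32-bit words.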
def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

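    # Pick the base pointer index per memory type; when feature maps are stored in the fast
    # storage area there is no separate scratch-fast region, so Scratch_fast shares the
    # ScratchTensor base pointer.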
    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            # TODO: This can be optimized for Yoda
            param = 0
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = get_op_kernel(ps)

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm_tensor.shape == []:
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "MulAct":
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        if input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                if primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                    )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from the end since a 1x1 AvgPool might have been added with a non-4-dimensional
                # input/output because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

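                # Pack the kernel strides into the KERNEL_STRIDE register value: bits 0/1 hold the
                # x/y stride low bits and bits 6.. / 9.. the extension bits; e.g. an x/y stride of
                # (2, 2) packs to 0b11.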
                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9

                if npu_block_type == NpuBlockType.Pooling:
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and valid_padding:
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        else:
                            # In case avg pool is fused with concat or another memory operation, rescaling might be
                            # needed. k_height == k_width == 1 is always true in this case.
                            # Normally the scale is maximised to get maximum precision, which means that if
                            # rescale != 1, the scale needs to account for the number of bits needed for rescaling.
                            rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                            rescale_bits = 0
                            if k_height == k_width == 1:
                                if fmf == "ConcatSliceWrite":
                                    rounding_mode = rounding.NATURAL
                                if rescale > 1:
                                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                elif rescale < 1:
                                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                            scale = int(round_away_zero(scale * rescale))

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so need
                # to setup the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

                # Extract weight substream offsets and calculate their lengths
                assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

                # Set weights sources for active and present cores
                for core, param in enumerate(
                    [
                        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
                        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
                    ]
                ):
                    if core < substreams:
                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
                        emit.cmd1_with_offset(param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core])
                    elif core < arch.ncores:
                        emit.cmd1_with_offset(param[0], weight_addr)
                        emit.cmd1_with_offset(param[1], 0)

                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

                    # Extract scale substream offsets and calculate their lengths
                    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

                    # Set scale sources for active and present cores
                    for core, param in enumerate(
                        [
                            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
                            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
                        ]
                    ):
                        if core < substreams:
                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
                            emit.cmd1_with_offset(param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core])
                        elif core < arch.ncores:
                            emit.cmd1_with_offset(param[0], scale_addr)
                            emit.cmd1_with_offset(param[1], 0)

                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

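            # Program region, base addresses, strides and zero point for the IFM, IFM2 (if any)
            # and OFM feature maps.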
            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite")
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in the W-dimension for now.
                        # It should be spread over H and W, but then block size selection
                        # and stride calculation would need to be changed.
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

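            # Assemble the OFM_PRECISION register value: bits 0-1 encode the data type (2 for
            # 16-bit, +1 if signed), bit 6 selects NHCWB16 layout, bit 8 selects the global scale
            # and the rounding mode is shifted in from bit 14.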
            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert ifm_dtype.size_in_bits() in {8, 16}

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
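            # Determine the block dependency for NPU_SET_BLOCKDEP: computed from the previous and
            # current stripe geometry when this stripe depends on the previous stripe's OFM, zero
            # when the shapes differ, and MAX_BLOCKDEP when there is no dependency.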
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = "Max" not in primary_op.type
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))