# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
# all the register settings. Calculates dependencies between commands and inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .numeric_util import full_shape
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemArea
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat


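# Note: RegisterMachine remembers the last value written to each register so that redundant
# register writes can be dropped from the command stream; set_register() returns True only
# when the value actually changes. switch_bank() is called after each operation so that state
# could be tracked per bank of in-flight operations (n_banks is currently 1, so banking is
# effectively disabled).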
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


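# Note on the command word encoding used by CommandStreamEmitter below: every command starts
# with one 32-bit word whose lower 16 bits hold the command opcode and mode bits (CmdMode)
# and whose upper 16 bits hold a 16-bit parameter. Commands with the Payload32 mode bit set
# are followed by a second 32-bit word carrying an address, length or scale payload.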
class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    ReadOnly = 0  # base address slot index for weights and scaling
    Scratch = 1  # base address slot index for scratch memory area


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


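# Note: CommandStreamEmitter accumulates the encoded 32-bit words and suppresses redundant
# register writes. Two RegisterMachine instances are kept, one for the DMA registers and one
# for everything else, so that caching of NPU register values is not invalidated by DMA
# register traffic (and vice versa); see get_reg_machine().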
class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


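# Note: the dependency calculation below tracks two running counters per command: the
# accumulated (blocks, commands) totals for kernel (NpuStripe) operations and the accumulated
# command count for DMA operations. For each command, the most recent earlier command of each
# type whose memory accesses conflict (or whose shared-buffer allocation is incompatible) is
# recorded, and the result is converted into relative "how many operations ago" offsets that
# emit_wait_commands() later turns into KERNEL_WAIT/DMA_WAIT commands.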
def calc_command_dependencies(cmd_stream, arch):
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])))

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
        cmd_ends[cmd] = np.array((pos[0], pos[1]))
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                # Check if we've got dependencies for all commands, in which case we can early out
                for dep in dep_cmds:
                    if dep is None:
                        break
                else:
                    break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


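# Note on the overall flow of generate_register_command_stream(): high-level commands that
# cannot be lowered are filtered out, dependencies between the remaining commands are
# computed, and then each command is translated into a block of NPU_SET_* register writes
# followed by any required kernel/DMA wait commands and the NPU_OP_* command that starts the
# operation. The stream is terminated with NPU_OP_STOP and stored on the subgraph as
# sg.register_command_stream.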
def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

    base_ptr_idx_map = {
        MemArea.Sram: BasePointerIndex.Scratch,
        MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
        MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
        MemArea.Dram: BasePointerIndex.ReadOnly,
    }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

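    # Note: emit_wait_commands() turns the relative dependencies computed by
    # calc_command_dependencies() into wait commands: a KERNEL_WAIT is emitted before a DMA
    # command that depends on recently issued kernel operations (only when the distance is
    # small enough, currently <= 3), and a DMA_WAIT is emitted before any command that
    # depends on an outstanding DMA transfer, with the wait amounts clamped to the allowable
    # range before being emitted.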
    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            param = relative_dep[CommandType.DMA][0]
            param = min(param, 0xF)  # Clamp to allowable wait amount
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm_tensor.shape == []:
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "MulAct":
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        if input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                if primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                    )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
                # because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0

                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
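                # For example, a strides attribute of (1, 2, 2, 1) (stride y=2, x=2) encodes
                # here as 0b11: bit 0 holds (x stride - 1) & 1, bit 1 holds (y stride - 1) & 1,
                # and the extension bits at positions 6 and 9 stay zero for strides up to 2.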


                if npu_block_type == NpuBlockType.Pooling:
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and valid_padding:
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        else:
                            # In case avg pool is fused with concat or other memory operation, rescaling might be needed.
                            # k_height == k_width == 1 is always true in this case
                            # Normally the scale is maximised, to get maximum precision, which means that
                            # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
                            rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                            rescale_bits = 0
                            if k_height == k_width == 1:
                                if fmf == "ConcatSliceWrite":
                                    rounding_mode = rounding.NATURAL
                                if rescale > 1:
                                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                elif rescale < 1:
                                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                            scale = int(round_away_zero(scale * rescale))

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so we need
                # to set up the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
                weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
                # Select weight/scale region depending on where permanent storage was defined
                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
                if arch.permanent_storage_mem_area == MemArea.Sram:
                    weight_region = BasePointerIndex.ReadOnly
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    # Get address and size of the scale/bias data area
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
                    scale_len = (
                        cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
                    )
                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
                    if arch.permanent_storage_mem_area == MemArea.Sram:
                        scale_region = BasePointerIndex.ReadOnly
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

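            # The loop below emits the per-feature-map registers (region, base addresses,
            # strides and zero point) for IFM, IFM2 and OFM in turn; each tuple pairs a tensor
            # and its box with the corresponding NPU_SET_* register enums.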
            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite")
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values.astype(np.uint8)
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, ifm2_scalar.item(0))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in W-dimension for now
                        # Should be spread over H and W, but then block size selection
                        # and stride calculation should be changed
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                if tens.mem_area == MemArea.Sram:
                    emit.cmd0_with_param(region_op, BasePointerIndex.Scratch)
                else:
                    emit.cmd0_with_param(region_op, BasePointerIndex.ReadOnly)

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

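            # OFM precision register, as encoded below: the low bits encode the output data
            # type width and signedness, bit 6 selects the NHCWB16 layout, bit 8 selects the
            # global OFM scale rather than per-channel scale, and the rounding mode is placed
            # at bit 14.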
            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert ifm_dtype.size_in_bits() in {8, 16}

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_kernel = get_op_kernel(cmd.ps)
            cur_padLT = get_op_padding_lt(cmd)
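            # Choose the BLOCKDEP value for this op: when the current op consumes the previous
            # op's output and the shapes match, arch.calc_block_dep() computes how far the two
            # ops may safely overlap; a shape-changing producer/consumer pair gets 0, and
            # independent ops get MAX_BLOCKDEP. The value is then clamped to arch.max_blockdep.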
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = "Max" not in primary_op.type
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))