# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream, generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and produces a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from enum import Enum
from enum import IntEnum

import numpy as np

from . import scaling
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .data_type import BaseType
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import ifm_precision
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .numeric_util import clamp_sigmoid
from .numeric_util import clamp_tanh
from .numeric_util import full_shape
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat


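# Caches the last value written to each register so that redundant register writes can be skipped:
# set_register() returns True only when the new (command, value) pair differs from the cached one.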
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


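# Builds the register command stream as a list of tuples of 32-bit words. cmd0-type commands are a
# single word with the opcode in the low 16 bits and the parameter in the high 16 bits; cmd1-type
# commands carry an additional 32-bit payload word. Redundant register writes are dropped via the
# two RegisterMachines (index 1 for DMA commands, index 0 for everything else) and waits that are
# already satisfied are dropped via last_absolute_wait.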
class CommandStreamEmitter:
    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * 4
        return sz

    def to_list(self):
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, param, absolute_wait_time):
        if absolute_wait_time <= self.last_absolute_wait[cmd]:
            return

        self.last_absolute_wait[cmd] = absolute_wait_time
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))

    def cmd_do_operation(self, cmd, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.get_reg_machine(cmd).switch_bank()


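# For every command in the stream, find the closest preceding command of each type (kernel op or
# DMA) that it has to wait for: either their memory accesses conflict, or an NPU stripe follows a
# stripe whose shared-buffer allocation is incompatible with its own. The result maps each command
# to its absolute start position plus the relative distances that emit_wait_commands() later turns
# into NPU_OP_KERNEL_WAIT / NPU_OP_DMA_WAIT commands.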
def calc_command_dependencies(cmd_stream, arch):
    cmd_starts = {}
    cmd_ends = {}
    memory_accesses = {}

    # Keep track of accumulated number of commands in command stream.
    # First element kernel ops: (# of blocks, # of commands)
    # Second element DMA ops: (# of commands)
    pos = np.array((np.array((0, 0)), np.array([0])))

    dependencies = {}

    for cmd in cmd_stream:
        cmd_starts[cmd] = pos
        op_count = cmd.get_operation_count()
        # Keep track of both num blocks and commands
        cmd_add = 0 if (op_count[0] == 0) else 1
        pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
        cmd_ends[cmd] = np.array((pos[0], pos[1]))
        memory_accesses[cmd] = cmd.get_memory_accesses()

    for idx, cmd in enumerate(cmd_stream):
        curr_accesses = memory_accesses[cmd]
        # Keep track of command dependency.
        # First element kernel ops: (# of blocks, # of commands)
        # Second element DMA ops: (# of commands)
        dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
        dep_cmds = [None] * CommandType.Size.value
        if idx > 0:
            # Look at the previous commands in backwards order
            for prev_cmd in cmd_stream[idx - 1 :: -1]:
                assert prev_cmd is not cmd
                if dep_cmds[prev_cmd.cmdtype] is None:
                    is_dependency = False
                    if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
                        # Special handling here, as dpu -> dpu operations require additional care
                        if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
                            is_dependency = True
                        elif memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True
                    else:
                        if memory_accesses[prev_cmd].conflicts(curr_accesses):
                            is_dependency = True

                    if is_dependency:
                        new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
                        if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
                            dep_cmds[prev_cmd.cmdtype] = prev_cmd
                            dep_offsets[prev_cmd.cmdtype] = new_offset

                        # Check if we've got dependencies for all commands, in which case we can early out
                        for dep in dep_cmds:
                            if dep is None:
                                break
                        else:
                            break  # all handled

        # Convert absolute to relative dependencies, using None to signal the special case of no
        # dependency of this kind
        res = [None] * CommandType.Size.value
        for i in range(CommandType.Size.value):
            if dep_cmds[i] is not None:
                res[i] = cmd_starts[cmd][i] - dep_offsets[i]

        dependencies[cmd] = cmd_starts[cmd], res

    return dependencies


def get_op_kernel(ps):
    if ps.primary_op is None:
        return None

    strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
    dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
    if ps.weight_tensor:
        if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
            k_h = 1
            k_w = 1
        else:
            k_h = ps.weight_tensor.shape[0]
            k_w = ps.weight_tensor.shape[1]
    else:
        k_h = ps.primary_op.attrs.get("filter_height", 1)
        k_w = ps.primary_op.attrs.get("filter_width", 1)

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])


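# A stripe depends on the previous stripe when the previous stripe's OFM is, by equivalence id, the
# IFM or IFM2 of the current one. This decides whether the block-level dependency (BLOCKDEP)
# calculation towards the end of generate_register_command_stream() is applicable.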
def has_prev_op_dependency(prev_cmd, cmd):
    if prev_cmd is None:
        return False
    if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
        if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
            return True
        elif cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
    return False


def get_op_ofm_rect(cmd):
    start = full_shape(4, cmd.ofm_box.start_coord, 0)
    end = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifm_rect(cmd):
    start = full_shape(4, cmd.ifm_box.start_coord, 0)
    end = full_shape(4, cmd.ifm_box.end_coord, 1)
    return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)


def get_op_ifmofm_block_depth(arch, cmd):
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)


def get_op_padding_lt(cmd):
    if cmd.ps.npu_block_type not in (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
    ):
        return (0, 0)

    explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    return (explicit_padding[1], explicit_padding[0])


def generate_register_command_stream(nng, sg, arch, verbose=False):
    emit = CommandStreamEmitter()

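    # Choose how each memory type maps onto the NPU base pointer (region) indices: when feature
    # maps live in the same memory area as the fast storage, Scratch and Scratch_fast share the
    # scratch base pointer, otherwise Scratch_fast gets its own index.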
    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
        }
    else:
        base_ptr_idx_map = {
            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
            MemType.Scratch: BasePointerIndex.ScratchTensor,
            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
        }

    # Maps an AccumulatorType enum to the corresponding acc_format value
    acc_format_map = {
        SHRAMElements.Acc16: acc_format.FP_S5_10.value,
        SHRAMElements.Acc32: acc_format.INT_32BIT.value,
        SHRAMElements.Acc40: acc_format.INT_40BIT.value,
    }

    # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
    elementwise_mode_map = {
        "MulAct": elementwise_mode.MUL.value,
        "AddAct": elementwise_mode.ADD.value,
        "SubAct": elementwise_mode.SUB.value,
        "Minimum": elementwise_mode.MIN.value,
        "Maximum": elementwise_mode.MAX.value,
        "LeakyRelu": elementwise_mode.LRELU.value,
        "Abs": elementwise_mode.ABS.value,
    }

    cmd_stream = []
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            cmd_stream.append(cmd)

    dependencies = calc_command_dependencies(cmd_stream, arch)

    # Initialise operator dependency state
    prev_ifm_rect = cur_ifm_rect = None
    prev_ifm_block_depth = cur_ifm_block_depth = None
    prev_ofm_rect = cur_ofm_rect = None
    prev_ofm_block = cur_ofm_block = None
    prev_kernel = cur_kernel = None
    prev_cmd = None

    def emit_wait_commands(cmd):
        # The command is fully set up, emit whatever wait commands we need
        absolute_dep, relative_dep = dependencies[cmd]
        if relative_dep[CommandType.NpuStripe] is not None:
            if cmd.cmdtype == CommandType.DMA:
                param = relative_dep[CommandType.NpuStripe][1]
                if param <= 3:
                    emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
            else:
                param = relative_dep[CommandType.NpuStripe][0]
                param = min(param, 0xFFFF)  # Clamp to allowable wait amount

        if relative_dep[CommandType.DMA] is not None:
            # TODO This can be optimized for yoda
            param = 0
            emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])

    for cmd in cmd_stream:
        if cmd.cmdtype == CommandType.DMA:
            start_coord = cmd.box.start_coord

            src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr

            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])

            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
            emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
            dma_channel = 0
            mode = 0  # From external to external

            emit_wait_commands(cmd)
            emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)

        elif cmd.cmdtype == CommandType.NpuStripe:

            ps = cmd.ps
            primary_op = ps.primary_op
            npu_block_type = ps.npu_block_type
            # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
            use_global_scale = False
            # Specifies type of rounding to be used.
            rounding_mode = rounding.TFL
            if primary_op.type == "ResizeBilinear":
                rounding_mode = rounding.TRUNCATE
            fmf = primary_op.attrs.get("fused_memory_function", None)
            faf = primary_op.attrs.get("fused_activation_function", None)

            # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
            op_to_scale = 0

            # Update state history
            prev_ifm_rect = cur_ifm_rect
            prev_ifm_block_depth = cur_ifm_block_depth
            prev_ofm_rect = cur_ofm_rect
            prev_ofm_block = cur_ofm_block
            prev_kernel = cur_kernel
            cur_kernel = get_op_kernel(ps)

            block_config = ps.block_config
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)

            shared_buffer = ps.shared_buffer

            if npu_block_type == NpuBlockType.ElementWise:
                ifm2_broadcast = 0

                if cmd.ifm_tensor.shape == []:
                    # The scalar has to be the ifm2 tensor so switch the ifms
                    cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
                    cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

                    # Set ReverseOperandOrder bit to IFM2_BROADCAST
                    ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

                # Calculate scales needed for arithmetic elementwise operators
                if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
                    input_scale = cmd.ifm_tensor.quantization.scale_f32
                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "MulAct":
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
                    else:  # AddAct/SubAct
                        if (faf == "Sigmoid") or (faf == "Tanh"):
                            output_scale = 1 / 0x3000

                        if input_scale == input2_scale:
                            opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale
                            )
                            opa_shift = 0  # Unused for this case
                        else:
                            # Use advanced implementation only when input scales differ
                            bitdepth = cmd.ifm_tensor.dtype.bits
                            (
                                opa_scale,
                                opa_shift,
                                ofm_scale,
                                shift,
                                op_to_scale,
                            ) = scaling.advanced_elementwise_add_sub_scale(
                                input_scale, input2_scale, output_scale, bitdepth
                            )
                            opb_scale = 0  # Unused for this case
                            if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
                                # If the operand order is reversed we also have to swap which operand is scaled
                                if op_to_scale == scaling.OperandToScale.OPa:
                                    op_to_scale = scaling.OperandToScale.OPb
                                else:
                                    op_to_scale = scaling.OperandToScale.OPa

                        emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                if primary_op.type in set(("LeakyRelu", "Abs",)):
                    output_scale = cmd.ofm_tensor.quantization.scale_f32
                    use_global_scale = True

                    if primary_op.type == "LeakyRelu":
                        output_scale *= primary_op.attrs["alpha"]

                    ofm_scale, shift = scaling.quantise_scale(output_scale)
                    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

                # For elementwise set the required SHRAM to be equal to the total size of SHRAM
                shram_required = arch.shram_total_banks
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

                # Acc buffers not needed so set AB_START to size of SHRAM
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)

                # Is not a unary operator
                if cmd.ifm2_tensor is not None:
                    if cmd.ifm2_tensor.shape == []:
                        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
                        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
                    else:
                        ifm_box_shape = cmd.ifm_box.get_size_shape()
                        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

                        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
                            # Broadcast in 'H' dimension
                            assert cmd.ifm2_tensor.shape[1] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

                        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
                            # Broadcast in 'W' dimension
                            assert cmd.ifm2_tensor.shape[2] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

                        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
                            # Broadcast in 'C' dimension
                            assert cmd.ifm2_tensor.shape[3] == 1
                            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

                    # Set IFM2_IB_START to the latter half of the IB space
                    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
                    )

                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

            else:
                emit.cmd0_with_param(
                    cmd0.NPU_SET_IFM_IB_END,
                    shared_buffer.bank_locations[SharedBufferArea.IFM]
                    + shared_buffer.banks_required[SharedBufferArea.IFM],
                )
                emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

            emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

            if primary_op.type == "ResizeBilinear":
                # perform nearest neighbor upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
            elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
                # perform insert zero upscale
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
            ):
                # Set up padding
                explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

                # Check if this is for horizontal ifm streaming
                if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
                    explicit_padding[0] = cmd.pad_top
                    explicit_padding[2] = cmd.pad_bottom

                # Indexing from the end since a 1x1 AvgPool might have been added with non 4-dimensional
                # input/output because an activation function needed to be fused.
                if cmd.ifm_box.start_coord[-2] > 0:
                    explicit_padding[1] = 0
                if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                    explicit_padding[3] = 0
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

                # set kernel x stride low bit
                stride = primary_op.attrs["strides"][2] - 1 & 1
                # set kernel y stride low bit
                stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
                # set kernel x stride extension bits
                stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
                # set kernel y stride extension bits
                stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9

                if npu_block_type == NpuBlockType.Pooling:
                    k_height, k_width = primary_op.attrs["ksize"][1:3]
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

                    valid_padding = sum(explicit_padding) == 0

                    if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and valid_padding:
                        # For valid padding vela has to output scaling values
                        if faf == "Sigmoid" or faf == "Tanh":
                            rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                multiplier = max(1, int(4096 * cmd.ifm_tensor.quantization.scale_f32 + 0.5))
                                rescale *= 3 * multiplier

                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)

                            if cmd.ifm_tensor.dtype == DataType.int16:
                                scale = (1 << shift) * 3 * multiplier
                            else:
                                scale = int(round_away_zero(scale * rescale))
                        else:
                            # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                            # k_height == k_width == 1 is always true in this case
                            # Normally the scale is maximised, to get maximum precision, which means that
                            # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
                            rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
                            rescale_bits = 0
                            if k_height == k_width == 1:
                                if fmf == "ConcatSliceWrite":
                                    rounding_mode = rounding.NATURAL
                                if rescale > 1:
                                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                                elif rescale < 1:
                                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                            scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                            scale = int(round_away_zero(scale * rescale))

                        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
                        # Valid-padded average pool should use the global scale from
                        # NPU_SET_OFM_SCALE register, which is set above.
                        use_global_scale = True

                else:  # Convolution
                    assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
                    # Reduced precision quantization and natural rounding used for int16
                    if cmd.ifm_tensor.dtype == DataType.int16:
                        rounding_mode = rounding.NATURAL
                    stride |= (cur_kernel.dilation.y - 1) << 4
                    stride |= (cur_kernel.dilation.x - 1) << 3
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
                    )
                    emit.cmd0_with_param(
                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
                    )
                    if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                        # Part-kernel-first weight ordering
                        assert npu_block_type == NpuBlockType.ConvolutionMxN
                        stride |= 1 << 2

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

            elif npu_block_type in set((NpuBlockType.VectorProduct,)):
                # Vector product is implemented using a 1x1 convolution so need
                # to setup the appropriate padding and kernel info
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

                # kernel stride reg = 0 means stride(1,1) + depth first weight
                # order + dilation(0,0) + kernel_split_size=8
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
                emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)

            if npu_block_type in set(
                (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
            ):
                # Emit Weight base address commands, only maps the area required for
                # this command's weights from the larger tensor.
                stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
                weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)

                # Emit Scale & Bias base address commands, with length matching the amount required by
                # the weight tensors.
                if cmd.scale_tensor is not None:
                    # Get address and size of the scale/bias data area
                    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
                    scale_len = (
                        cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
                    )
                    # Emit base address for NPU to access scale & bias data
                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))

            ofm_quant = cmd.ofm_tensor.quantization
            ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
            ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
            ifm_min = cmd.ifm_tensor.quantization.min
            ifm_max = cmd.ifm_tensor.quantization.max

            # Emit commands for any fused activation function
            if faf is None:
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                # Even if no activation function, values need to be set to override previous values
                faf_min = ofm_quant_qmin
                faf_max = ofm_quant_qmax
            elif faf == "Relu":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = ofm_quant_qmax
            elif faf == "Relu6":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "ReluN1To1":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
                faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Tanh":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            elif faf == "Sigmoid":
                emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
                if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
                    faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
                else:
                    faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
                    faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
            else:
                raise Exception("Unsupported fused_activation_function = " + faf)

            # Activation range needs to be set based upon the quantisation range and the fused activation range
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
            emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))

            out_shape = cmd.ofm_box.get_size_shape()
            if len(out_shape) >= 4:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
            if len(out_shape) >= 2:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

            if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
                in_shape = cmd.ifm_box.get_size_shape()
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
            else:
                emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)

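            # Program region index, rolling-buffer geometry, the four base addresses, the strides
            # and the zero point for each of the three feature maps (IFM, IFM2 and OFM); entries
            # whose tensor is None (e.g. IFM2 for non-elementwise operations) are skipped.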
            for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in (
                (
                    cmd.ifm_tensor,
                    cmd.ifm_box,
                    cmd0.NPU_SET_IFM_REGION,
                    (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
                    (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
                    cmd0.NPU_SET_IFM_ZERO_POINT,
                ),
                (
                    cmd.ifm2_tensor,
                    cmd.ifm2_box,
                    cmd0.NPU_SET_IFM2_REGION,
                    (
                        cmd1.NPU_SET_IFM2_BASE0,
                        cmd1.NPU_SET_IFM2_BASE1,
                        cmd1.NPU_SET_IFM2_BASE2,
                        cmd1.NPU_SET_IFM2_BASE3,
                    ),
                    (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
                    cmd0.NPU_SET_IFM2_ZERO_POINT,
                ),
                (
                    cmd.ofm_tensor,
                    cmd.ofm_box,
                    cmd0.NPU_SET_OFM_REGION,
                    (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
                    (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
                    cmd0.NPU_SET_OFM_ZERO_POINT,
                ),
            ):

                if tens is None:
                    continue

                need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite")
                if (
                    primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")) and not need_zero_point
                ) or tens.quantization is None:
                    # Actual integer operation, just set scale to 1 and zero point to 0
                    emit.cmd0_with_param(zero_point_op, 0)
                else:
                    assert tens.quantization.zero_point is not None, "need an actual zero point set"
                    emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))

                if tens.shape == []:
                    # Empty shape, elementwise constant
                    ifm2_scalar = tens.quant_values
                    assert ifm2_scalar.size == 1
                    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
                    continue

                height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
                    box.start_coord, box.end_coord
                )
                if npu_block_type != NpuBlockType.VectorProduct:
                    if tens == cmd.ifm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
                    elif tens == cmd.ofm_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
                    if tens == cmd.ifm2_tensor:
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
                        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
                else:
                    if len(out_shape) == 2:
                        # TODO: N is put in W-dimension for now
                        # Should be spread over H and W, but then block size selection
                        # and stride calculation would need to change
                        if tens == cmd.ifm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
                        elif tens == cmd.ofm_tensor:
                            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
                    else:
                        assert False

                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

                for idx, addr in enumerate(addresses):
                    if addr is None:
                        addresses[idx] = 0

                emit.cmd1_with_offset(ptr_ops[0], addresses[0])
                emit.cmd1_with_offset(ptr_ops[1], addresses[1])
                emit.cmd1_with_offset(ptr_ops[2], addresses[2])
                emit.cmd1_with_offset(ptr_ops[3], addresses[3])

                strides = tens.get_strides()
                emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
                emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
                emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

                if tens.format == TensorFormat.NHCWB16:
                    # Check that all BasePointer addresses are aligned to 16 bytes
                    assert (int(addresses[0]) % 16) == 0
                    assert (int(addresses[1]) % 16) == 0
                    assert (int(addresses[2]) % 16) == 0
                    assert (int(addresses[3]) % 16) == 0

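            # Build the OFM_PRECISION register value: bits 0-1 encode the output data type
            # (0/1 = uint8/int8, 2/3 = uint16/int16), bit 6 selects the NHCWB16 layout, bit 8
            # selects the global OFM scale instead of per-channel scales, and the rounding mode
            # goes in the bits from 14 upwards.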
            ofm_dtype = cmd.ofm_tensor.dtype
            assert ofm_dtype.type & BaseType.Int
            prec = 0
            if ofm_dtype.size_in_bits() == 8:
                prec = 0
            elif ofm_dtype.size_in_bits() == 16:
                prec = 2
            else:
                assert 0

            if ofm_dtype.type & BaseType.Signed:
                prec += 1

            if use_global_scale:
                # Set global scale bit, as opposed to using per channel scale
                prec |= 1 << 8

            if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
                prec |= 1 << 6

            prec |= rounding_mode.value << 14

            emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

            prec = None
            weight_bits = 8
            if cmd.weight_tensor is not None:
                weight_bits = cmd.weight_tensor.dtype.size_in_bits()

            ifm_dtype = cmd.ifm_tensor.dtype

            assert weight_bits == 8, "Unsupported weight bit depth"
            assert ifm_dtype.size_in_bits() in {8, 16}

            if ifm_dtype.size_in_bits() == 8:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S8
                else:
                    prec = ifm_precision.U8
            elif ifm_dtype.size_in_bits() == 16:
                if ifm_dtype.type & BaseType.Signed:
                    prec = ifm_precision.S16
                else:
                    prec = ifm_precision.U16

            ifm_prec = prec.value
            ifm2_prec = ifm_prec

            if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
                ifm_prec |= 1 << 6

            ifm_prec |= op_to_scale << 8

            emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

            if cmd.ifm2_tensor is not None:
                if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
                    ifm2_prec |= 1 << 6
                emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)

            emit_wait_commands(cmd)

            # Get op parameters
            cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
            cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
            cur_ofm_rect = get_op_ofm_rect(cmd)
            cur_ifm_rect = get_op_ifm_rect(cmd)
            cur_padLT = get_op_padding_lt(cmd)
            if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
                    blockdep = arch.calc_block_dep(
                        prev_ifm_rect,
                        prev_ofm_rect,
                        prev_ifm_block_depth,
                        prev_ofm_block,
                        prev_kernel,
                        cur_ifm_rect,
                        cur_ofm_rect,
                        cur_ifm_block_depth,
                        cur_ofm_block,
                        cur_kernel,
                        cur_padLT,
                    )
                else:
                    blockdep = 0
            else:
                blockdep = ArchitectureFeatures.MAX_BLOCKDEP

            # Set between every op (dependent or not)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_cmd = cmd

            if npu_block_type == NpuBlockType.ConvolutionMxN:
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
                emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
            elif npu_block_type == NpuBlockType.VectorProduct:
                # Vector product is implemented using a 1x1 convolution
                emit.cmd_do_operation(cmd0.NPU_OP_CONV)
            elif npu_block_type == NpuBlockType.Pooling:
                param = "Max" not in primary_op.type
                emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
            elif npu_block_type == NpuBlockType.ElementWise:
                param = elementwise_mode_map[primary_op.type]
                emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
            else:
                print("Warning: Skipping register command stream generation for", ps)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))