Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates |
| 18 | # all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit |
| 19 | # stream suitable for interpretation by the Ethos-U55 processor. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 20 | from collections import defaultdict |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 21 | from collections import namedtuple |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 22 | from enum import Enum |
| 23 | from enum import IntEnum |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 24 | |
| 25 | import numpy as np |
| 26 | |
| 27 | from . import scaling |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 28 | from .architecture_features import ArchitectureFeatures |
| 29 | from .architecture_features import Block |
| 30 | from .architecture_features import Kernel |
| 31 | from .architecture_features import Rect |
| 32 | from .architecture_features import SharedBufferArea |
| 33 | from .architecture_features import SHRAMElements |
| 34 | from .data_type import BaseType |
| 35 | from .data_type import DataType |
| 36 | from .ethos_u55_regs.ethos_u55_regs import acc_format |
| 37 | from .ethos_u55_regs.ethos_u55_regs import activation |
| 38 | from .ethos_u55_regs.ethos_u55_regs import cmd0 |
| 39 | from .ethos_u55_regs.ethos_u55_regs import cmd1 |
| 40 | from .ethos_u55_regs.ethos_u55_regs import elementwise_mode |
| 41 | from .ethos_u55_regs.ethos_u55_regs import ifm_precision |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 42 | from .ethos_u55_regs.ethos_u55_regs import pooling_mode |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 43 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 44 | from .ethos_u55_regs.ethos_u55_regs import rounding |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 45 | from .high_level_command_stream import CommandType |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 46 | from .numeric_util import clamp_sigmoid |
| 47 | from .numeric_util import clamp_tanh |
Louis Verhaard | b2fb212 | 2020-06-04 15:51:24 +0200 | [diff] [blame] | 48 | from .numeric_util import full_shape |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 49 | from .numeric_util import quantise_float32 |
| 50 | from .numeric_util import round_away_zero |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 51 | from .numeric_util import round_up_to_int |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 52 | from .operation import NpuBlockType |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 53 | from .operation import Op |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 54 | from .tensor import MemType |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 55 | from .tensor import TensorBlockTraversal |
| 56 | from .tensor import TensorFormat |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 57 | from .tensor import TensorPurpose |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 58 | |
| 59 | |
class RegisterMachine:
    """Remembers the last value written to each register so redundant
    register writes can be elided from the command stream.

    Registers are grouped into banks; only the currently selected bank is
    consulted/updated by set_register.
    """

    def __init__(self):
        # A single bank is used here; the bank list is kept general so
        # switch_bank remains meaningful if n_banks is ever raised.
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Record *value* for *reg*; return True if it differs from the
        previously recorded value (i.e. the write is not redundant)."""
        bank = self.registers[self.bank_idx]
        changed = bank[reg] != value
        bank[reg] = value
        return changed

    def switch_bank(self):
        """Advance to the next register bank (round robin)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
| 74 | |
| 75 | |
class CmdMode(IntEnum):
    """Bit-field layout of a 16-bit command code word.

    The top two bits select the payload mode; the low ten bits carry the
    command opcode.
    """

    NoPayload = 0  # command word only, no payload follows
    Payload32 = 1 << 14  # one 32-bit payload word follows
    Mask = 0x3 << 14  # mask selecting the payload-mode bits
    CmdOpMask = 0x3FF  # mask selecting the command opcode bits
| 81 | |
| 82 | |
class BasePointerIndex(IntEnum):
    """Base-address register indices used when addressing tensors."""

    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    # Encodes region 3 with an extra flag bit (bit 8) — base address slot
    # for memory-to-memory transfer; value is (1 << 8) | 3.
    Mem2Mem = 0x103
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 88 | |
| 89 | |
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Flag bits for the IFM2_BROADCAST register of elementwise ops."""

    BroadcastHdim = 0x01  # broadcast IFM2 along the H dimension
    BroadcastWdim = 0x02  # broadcast IFM2 along the W dimension
    BroadcastCdim = 0x04  # broadcast IFM2 along the C dimension
    ReverseOperandOrder = 0x40  # operands were swapped; reverse order in HW
    UseIFM2Scalar = 0x80  # IFM2 is a scalar constant, not a tensor
| 97 | |
| 98 | |
class CommandStreamEmitter:
    """Accumulates encoded command words and elides redundant register
    writes via per-stream RegisterMachines (one for DMA, one for the rest).
    """

    def __init__(self):
        self.cmd_stream = []  # list of tuples of 32-bit words, one tuple per command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)

    def get_reg_machine(self, cmd):
        # DMA commands track their register state separately from NPU commands
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Total size of the emitted stream in bytes (4 bytes per word)."""
        return 4 * sum(len(words) for words in self.cmd_stream)

    def to_list(self):
        """Flatten the stream into a single list of 32-bit words."""
        flat = []
        for words in self.cmd_stream:
            flat.extend(words)
        return flat

    def print_cmds(self):
        """Pretty-print the emitted command stream for debugging."""
        print("Code: Command: Param: Payload:")
        for words in self.cmd_stream:
            code = words[0] & 0x0000FFFF  # lower 16 bits
            param = words[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command name
            op_enum = cmd0 if payload_mode == CmdMode.NoPayload else cmd1
            line = " 0x%04x " % code + str(op_enum(code & CmdMode.CmdOpMask))
            line = line.ljust(40)
            line += "%5d" % param

            # payload (only Payload32 commands carry one)
            if payload_mode == CmdMode.Payload32:
                line += " 0x%08x (%d)" % (words[1], words[1])
            else:
                line += " -"

            print(line)

    def cmd0_with_param(self, cmd, param):
        """Emit a cmd0-style command with a 16-bit parameter, unless the
        identical write was already issued (register caching)."""
        value = int(param.value) if isinstance(param, Enum) else int(param)
        value = value & 0xFFFF
        command = cmd.value | (value << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, value)):
            # Not a redundant command, actually write it
            self.cmd_stream.append((command,))

    def cmd1_with_offset(self, cmd, offset, param=0x0):
        """Emit a cmd1-style command with a 32-bit payload (offset)."""
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # Not a redundant command, actually write it
            self.cmd_stream.append((command, offset))

    def cmd_wait(self, cmd, channel, outstanding_count):
        """Emit a wait command; never elided, as waits are order-sensitive."""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))

    def cmd_do_operation(self, cmd, param=0):
        """Emit an operation-start command and rotate the register bank."""
        param = int(param)
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.get_reg_machine(cmd).switch_bank()
| 180 | |
| 181 | |
# (npu, dma) pair of command-stream indices; used both as the search
# watermark and as the outstanding-op counts for wait insertion.
Watermark = namedtuple("Watermark", "npu dma")
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 183 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 184 | |
def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
    """Compute the wait requirements for the command at cmd_index.

    Scans backwards through cmd_stream for earlier NPU/DMA commands whose
    memory accesses conflict with this command's accesses, counting how many
    ops of each kind are in flight in between.

    Returns a (watermark, outstanding) pair of Watermark tuples:
    - watermark: updated (npu, dma) indices; commands before these are
      already guaranteed complete and need not be re-examined.
    - outstanding: per-queue outstanding-op counts to wait on, or -1 when
      no wait is needed for that queue.
    """
    cmd = cmd_stream[cmd_index]
    cmd_access = memory_accesses[cmd]
    index = cmd_index - 1  # start scanning at the immediately preceding command

    # NPU dependency tracking
    npu_outstanding = -1  # -1 means no NPU dependency found (yet)
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1  # -1 means no DMA dependency found (yet)
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_cmd = cmd_stream[index]
        prev_access = memory_accesses[prev_cmd]

        # Check DMA consuming NPU output
        if prev_cmd.cmdtype == CommandType.NpuStripe:
            if index >= npu_index:
                # Only the first (most recent) conflict matters; its pipeline
                # depth is the number of kernels to leave outstanding.
                if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
                    npu_outstanding = npu_ops
                npu_ops = npu_ops + 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    # Pipeline full: anything at/before this index is complete
                    npu_index = max(index + 1, npu_index)

        # Check NPU consuming DMA output
        elif prev_cmd.cmdtype == CommandType.DMA:
            if index >= dma_index:
                if cmd.cmdtype == CommandType.NpuStripe:
                    if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
                        dma_outstanding = dma_ops
                dma_ops = dma_ops + 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)

        index = index - 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = cmd_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 240 | |
| 241 | |
def get_op_kernel(ps):
    """Derive the Kernel (size, stride, dilation) for a pass's primary op.

    Returns None when the pass has no primary op. Kernel size comes from the
    weight tensor shape when one exists (except for VectorProduct/ElementWise,
    which use 1x1), otherwise from the op's filter_height/filter_width attrs.
    """
    op = ps.primary_op
    if op is None:
        return None

    strides = op.attrs.get("strides", (1, 1, 1, 1))
    dilation = op.attrs.get("dilation", (1, 1, 1, 1))
    if not ps.weight_tensor:
        k_h = op.attrs.get("filter_height", 1)
        k_w = op.attrs.get("filter_width", 1)
    elif ps.npu_block_type in (NpuBlockType.VectorProduct, NpuBlockType.ElementWise):
        # These block types ignore the weight tensor's spatial shape
        k_h = 1
        k_w = 1
    else:
        k_h = ps.weight_tensor.shape[0]
        k_w = ps.weight_tensor.shape[1]

    return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
| 260 | |
| 261 | |
def has_prev_op_dependency(prev_cmd, cmd):
    """Return True if cmd consumes (via IFM or IFM2) the OFM produced by
    prev_cmd; only NpuStripe commands from different passes can depend."""
    if prev_cmd is None:
        return False
    both_stripes = prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe
    if both_stripes and prev_cmd.ps != cmd.ps:
        if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor):
            return True
        if cmd.ifm2_tensor is not None:
            return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor)
    return False
| 271 | |
| 272 | |
def get_op_ofm_rect(cmd):
    """Return the OFM box of cmd as an inclusive Rect in (x, y, z) order.

    Coordinates are padded to rank 4 (start with 0, end with 1) before the
    NHWC -> Rect axis mapping; end coordinates become inclusive via -1.
    """
    lo = full_shape(4, cmd.ofm_box.start_coord, 0)
    hi = full_shape(4, cmd.ofm_box.end_coord, 1)
    return Rect(lo[-2], lo[-3], lo[-1], hi[-2] - 1, hi[-3] - 1, hi[-1] - 1)
| 277 | |
| 278 | |
| 279 | def get_op_ifm_rect(cmd): |
Charles Xu | 3e9c434 | 2020-04-22 08:31:43 +0200 | [diff] [blame] | 280 | start = full_shape(4, cmd.ifm_box.start_coord, 0) |
| 281 | end = full_shape(4, cmd.ifm_box.end_coord, 1) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 282 | return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) |
| 283 | |
| 284 | |
def get_op_ifmofm_block_depth(arch, cmd):
    """Return the IFM block depth to program for cmd.

    Note: NOT equivalent to the normal ifm block depth calculation since
    it takes into account 'depthless' block operations by returning full
    depth for those block types.
    """
    depthless_types = (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ElementWise,
        NpuBlockType.ReduceSum,
    )
    if cmd.ps.npu_block_type in depthless_types:
        return cmd.ofm_box.get_size_shape()[-1]

    return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
| 298 | |
| 299 | |
def get_op_padding_lt(cmd):
    """Return the (left, top) padding for cmd's operation.

    Block types without spatial padding return (0, 0). When the stripe is
    split in the H dimension, the stripe's own top/bottom padding overrides
    the op's explicit padding.
    """
    padded_block_types = (
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ReduceSum,
    )
    if cmd.ps.npu_block_type not in padded_block_types:
        return (0, 0)

    padding = list(cmd.ps.primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        padding[0] = cmd.pad_top
        padding[2] = cmd.pad_bottom

    return (padding[1], padding[0])
| 317 | |
| 318 | |
def ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
    """Return True when the operands are already in the required order.

    The hardware requires the scalar operand, or any operand being
    broadcast (dimension of size 1 where the shapes differ), to be IFM2.
    """
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape == []:
        return True

    # A broadcast FM (size-1 dim where the other operand differs) must be IFM2
    return not any(a != b and a == 1 for a, b in zip(ifm_shape, ifm2_shape))
| 332 | |
| 333 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 334 | def generate_register_command_stream(nng, sg, arch, verbose=False): |
| 335 | emit = CommandStreamEmitter() |
| 336 | |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 337 | if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: |
| 338 | base_ptr_idx_map = { |
| 339 | MemType.Permanent_NPU: BasePointerIndex.WeightTensor, |
| 340 | MemType.Permanent_CPU: BasePointerIndex.WeightTensor, |
| 341 | MemType.Scratch: BasePointerIndex.ScratchTensor, |
| 342 | MemType.Scratch_fast: BasePointerIndex.ScratchTensor, |
| 343 | } |
| 344 | else: |
| 345 | base_ptr_idx_map = { |
| 346 | MemType.Permanent_NPU: BasePointerIndex.WeightTensor, |
| 347 | MemType.Permanent_CPU: BasePointerIndex.WeightTensor, |
| 348 | MemType.Scratch: BasePointerIndex.ScratchTensor, |
| 349 | MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor, |
| 350 | } |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 351 | |
| 352 | # Maps an AccumulatorType enum to the corresponding acc_format value |
| 353 | acc_format_map = { |
| 354 | SHRAMElements.Acc16: acc_format.FP_S5_10.value, |
| 355 | SHRAMElements.Acc32: acc_format.INT_32BIT.value, |
| 356 | SHRAMElements.Acc40: acc_format.INT_40BIT.value, |
| 357 | } |
| 358 | |
| 359 | # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE |
| 360 | elementwise_mode_map = { |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 361 | Op.Mul: elementwise_mode.MUL.value, |
| 362 | Op.Add: elementwise_mode.ADD.value, |
| 363 | Op.Sub: elementwise_mode.SUB.value, |
| 364 | Op.Minimum: elementwise_mode.MIN.value, |
| 365 | Op.Maximum: elementwise_mode.MAX.value, |
| 366 | Op.LeakyRelu: elementwise_mode.LRELU.value, |
| 367 | Op.Abs: elementwise_mode.ABS.value, |
| 368 | Op.CLZ: elementwise_mode.CLZ.value, |
| 369 | Op.SHR: elementwise_mode.SHR.value, |
| 370 | Op.SHL: elementwise_mode.SHL.value, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 371 | } |
| 372 | |
| 373 | cmd_stream = [] |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 374 | memory_accesses = {} |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 375 | for cmd in sg.high_level_command_stream: |
| 376 | if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default: |
| 377 | print("Warning: Skipping register command stream generation for", cmd.ps) |
| 378 | else: |
| 379 | cmd_stream.append(cmd) |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 380 | memory_accesses[cmd] = cmd.get_memory_accesses() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 381 | |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 382 | def emit_cmd_waits(cmd_waits): |
| 383 | if cmd_waits.npu >= 0: |
| 384 | emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu) |
| 385 | |
| 386 | if cmd_waits.dma >= 0: |
| 387 | emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 388 | |
| 389 | # Initialise operator dependency state |
| 390 | prev_ifm_rect = cur_ifm_rect = None |
| 391 | prev_ifm_block_depth = cur_ifm_block_depth = None |
| 392 | prev_ofm_rect = cur_ofm_rect = None |
| 393 | prev_ofm_block = cur_ofm_block = None |
| 394 | prev_kernel = cur_kernel = None |
| 395 | prev_cmd = None |
| 396 | |
Tim Hall | 42e4189 | 2020-07-06 10:51:31 +0100 | [diff] [blame] | 397 | if arch.is_yoda_system: |
Jacob Bohlin | 0b9ca78 | 2020-07-09 11:16:30 +0200 | [diff] [blame] | 398 | emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1) |
Tim Hall | f7e810a | 2020-06-25 15:04:31 +0100 | [diff] [blame] | 399 | |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 400 | dep_watermark = Watermark(0, 0) |
| 401 | |
| 402 | for cmd_index, cmd in enumerate(cmd_stream): |
| 403 | dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark) |
| 404 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 405 | if cmd.cmdtype == CommandType.DMA: |
| 406 | start_coord = cmd.box.start_coord |
| 407 | |
| 408 | src_addr = cmd.in_tensor.address_for_coordinate(start_coord) |
| 409 | dst_addr = cmd.out_tensor.address_for_coordinate(start_coord) |
| 410 | |
| 411 | if cmd.in_tensor.compressed_values is not None: |
| 412 | stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord) |
| 413 | sz = cmd.in_tensor.size_of_compressed_stream(stream_index) |
| 414 | else: |
| 415 | sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr |
| 416 | |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 417 | emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type]) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 418 | emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr) |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 419 | if cmd.out_tensor.purpose == TensorPurpose.LUT: |
| 420 | emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem) |
| 421 | else: |
| 422 | emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type]) |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 423 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 424 | emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr) |
| 425 | emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz) |
| 426 | dma_channel = 0 |
| 427 | mode = 0 # From external to external |
| 428 | |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 429 | emit_cmd_waits(cmd_waits) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 430 | emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode) |
| 431 | |
| 432 | elif cmd.cmdtype == CommandType.NpuStripe: |
| 433 | |
| 434 | ps = cmd.ps |
| 435 | primary_op = ps.primary_op |
| 436 | npu_block_type = ps.npu_block_type |
| 437 | # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale |
| 438 | use_global_scale = False |
| 439 | # Specifies type of rounding to be used. |
Tim Hall | d775e37 | 2020-08-28 18:33:38 +0100 | [diff] [blame] | 440 | rounding_mode = ( |
| 441 | rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL |
| 442 | ) |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 443 | if primary_op.type == Op.ResizeBilinear: |
Dwight Lidman | 3ec04ac | 2020-04-30 11:54:48 +0200 | [diff] [blame] | 444 | rounding_mode = rounding.TRUNCATE |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 445 | fmf = primary_op.memory_function |
| 446 | faf = primary_op.activation |
| 447 | fused_quantize = any(op.type == Op.Quantize for op in ps.ops) |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 448 | # Force output scale, used in operations with fused LUT |
| 449 | # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization |
| 450 | # except when primary_op is AddAct + 0 (no-op) + LUT |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 451 | forced_ofm_quantization = primary_op.forced_output_quantization |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 452 | ofm_quant = cmd.ofm_tensor.quantization |
| 453 | if forced_ofm_quantization is not None: |
| 454 | ofm_quant = forced_ofm_quantization |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 455 | |
| 456 | # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB |
| 457 | op_to_scale = 0 |
| 458 | |
| 459 | # Update state history |
| 460 | prev_ifm_rect = cur_ifm_rect |
| 461 | prev_ifm_block_depth = cur_ifm_block_depth |
| 462 | prev_ofm_rect = cur_ofm_rect |
| 463 | prev_ofm_block = cur_ofm_block |
| 464 | prev_kernel = cur_kernel |
Louis Verhaard | b2fb212 | 2020-06-04 15:51:24 +0200 | [diff] [blame] | 465 | cur_kernel = get_op_kernel(ps) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 466 | |
| 467 | block_config = ps.block_config |
| 468 | emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1) |
| 469 | emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1) |
| 470 | emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1) |
| 471 | |
| 472 | shared_buffer = ps.shared_buffer |
| 473 | |
| 474 | if npu_block_type == NpuBlockType.ElementWise: |
Jacob Bohlin | be733cf | 2020-08-13 10:21:34 +0200 | [diff] [blame] | 475 | ifm2_broadcast = 0 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 476 | |
Jacob Bohlin | bf61268 | 2020-08-13 09:37:02 +0200 | [diff] [blame] | 477 | if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 478 | # The scalar has to be the ifm2 tensor so switch the ifms |
| 479 | cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor |
| 480 | cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box |
| 481 | |
| 482 | # Set ReverseOperandOrder bit to IFM2_BROADCAST |
| 483 | ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder |
| 484 | |
| 485 | # Calculate scales needed for arithmetic elementwise operators |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 486 | if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)): |
Fredrik Svedberg | 0f98b36 | 2020-09-29 10:00:39 +0200 | [diff] [blame] | 487 | input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None |
| 488 | input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None |
| 489 | output_scale = ofm_quant.scale_f32 if ofm_quant else None |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 490 | use_global_scale = True |
| 491 | |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 492 | if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh): |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 493 | output_scale = 1 / 0x3000 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 494 | |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 495 | if primary_op.type == Op.Mul: |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 496 | if None in (input_scale, input2_scale, output_scale): |
| 497 | ofm_scale = 1 |
| 498 | shift = 0 |
| 499 | else: |
| 500 | ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 501 | emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) |
| 502 | else: # AddAct/SubAct |
Charles Xu | 9a03fdf | 2020-07-02 15:12:40 +0200 | [diff] [blame] | 503 | # Force output scale same as the input scale for |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 504 | # resizebilinear 1x1 that is converted to add |
Charles Xu | 9a03fdf | 2020-07-02 15:12:40 +0200 | [diff] [blame] | 505 | if "resizebilinear" in primary_op.attrs: |
| 506 | output_scale = input2_scale |
| 507 | |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 508 | if None in (input_scale, input2_scale, output_scale): |
| 509 | opa_scale = opb_scale = ofm_scale = 1 |
| 510 | opa_shift = shift = 0 |
Fredrik Svedberg | 597fd3f | 2020-08-13 10:02:53 +0200 | [diff] [blame] | 511 | ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0]) |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 512 | elif input_scale == input2_scale: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 513 | opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( |
| 514 | input_scale, input2_scale, output_scale |
| 515 | ) |
| 516 | opa_shift = 0 # Unused for this case |
| 517 | else: |
| 518 | # Use advanced implementation only when input scales differ |
| 519 | bitdepth = cmd.ifm_tensor.dtype.bits |
| 520 | ( |
| 521 | opa_scale, |
| 522 | opa_shift, |
| 523 | ofm_scale, |
| 524 | shift, |
| 525 | op_to_scale, |
| 526 | ) = scaling.advanced_elementwise_add_sub_scale( |
| 527 | input_scale, input2_scale, output_scale, bitdepth |
| 528 | ) |
| 529 | opb_scale = 0 # Unused for this case |
| 530 | if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder: |
| 531 | # If the operand order is reversed we also have to swap which operand is scaled |
| 532 | if op_to_scale == scaling.OperandToScale.OPa: |
| 533 | op_to_scale = scaling.OperandToScale.OPb |
| 534 | else: |
| 535 | op_to_scale = scaling.OperandToScale.OPa |
| 536 | |
| 537 | emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) |
| 538 | emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) |
| 539 | emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) |
| 540 | |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 541 | elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)): |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 542 | output_scale = ofm_quant.scale_f32 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 543 | use_global_scale = True |
| 544 | |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 545 | if primary_op.type == Op.LeakyRelu: |
Louis Verhaard | 58520b9 | 2020-08-24 16:45:38 +0200 | [diff] [blame] | 546 | output_scale = primary_op.attrs["alpha"] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 547 | |
| 548 | ofm_scale, shift = scaling.quantise_scale(output_scale) |
| 549 | emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 550 | else: |
| 551 | emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 552 | |
Louis Verhaard | 0b8268a | 2020-08-05 16:11:29 +0200 | [diff] [blame] | 553 | # For elementwise set the required SHRAM to be equal to the total size of available SHRAM |
| 554 | uses_lut = primary_op.activation_lut is not None |
| 555 | shram_required = arch.available_shram_banks(uses_lut) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 556 | emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) |
| 557 | |
| 558 | # Acc buffers not needed so set AB_START to size of SHRAM |
Louis Verhaard | 0b8268a | 2020-08-05 16:11:29 +0200 | [diff] [blame] | 559 | emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 560 | |
# Is not a unary operator: a second input (IFM2) exists, so work out how it
# should be broadcast against IFM and where its input buffer starts.
if cmd.ifm2_tensor is not None:
    if cmd.ifm2_tensor.shape == []:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        # Compare the box sizes per dimension; any mismatch means IFM2 must
        # be broadcast along that axis (its extent is asserted to be 1).
        ifm_box_shape = cmd.ifm_box.get_size_shape()
        ifm2_box_shape = cmd.ifm2_box.get_size_shape()

        if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
            # Broadcast in 'H' dimension
            assert cmd.ifm2_tensor.shape[1] == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
            # Broadcast in 'W' dimension
            assert cmd.ifm2_tensor.shape[2] == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
            # Broadcast in 'C' dimension
            assert cmd.ifm2_tensor.shape[3] == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    # Set IFM2_IB_START to the latter half of the IB space
    ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
    # NOTE(review): `/` is true division in Python 3, so an odd bank count
    # yields a float (x.5) as the register parameter — `//` looks intended;
    # confirm against cmd0_with_param's handling before changing.
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
    )

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
| 592 | |
else:
    # Non-elementwise path (pairs with the elementwise branch opened above
    # this chunk): IFM input buffer ends where the allocated IFM banks end,
    # and the accumulator buffer starts at its allocated bank location.
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM]
        + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

# Accumulator element format comes from the shared-buffer allocation (both paths).
emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
| 602 | |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 603 | if primary_op.type == Op.ResizeBilinear: |
Dwight Lidman | 3ec04ac | 2020-04-30 11:54:48 +0200 | [diff] [blame] | 604 | # perform nearest neighbor upscale |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 605 | emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST) |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 606 | elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias: |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 607 | # perform insert zero upscale |
| 608 | emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE) |
Dwight Lidman | 3ec04ac | 2020-04-30 11:54:48 +0200 | [diff] [blame] | 609 | else: |
Jacob Bohlin | cf7da10 | 2020-05-20 09:03:40 +0200 | [diff] [blame] | 610 | emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 611 | |
# Padding, kernel geometry, stride encoding and (for pooling/reduce-sum)
# output scaling for all convolution-like block types.
if npu_block_type in set(
    (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    )
):
    # Set up padding
    explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)

    # Check if this is for horizontal ifm streaming: interior stripes take
    # their top/bottom padding from the stripe itself.
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        explicit_padding[0] = cmd.pad_top
        explicit_padding[2] = cmd.pad_bottom

    # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
    # because of activation function needed to be fused.
    # Boxes that do not touch the tensor's left/right edge get no side padding.
    if cmd.ifm_box.start_coord[-2] > 0:
        explicit_padding[1] = 0
    if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
        explicit_padding[3] = 0
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])

    # Encode (stride - 1) split into low bit + extension bits.
    # Python precedence: `-` binds tighter than `&`/`>>`, so `x - 1 & 1`
    # is `(x - 1) & 1`.
    # set kernel x stride low bit
    stride = primary_op.attrs["strides"][2] - 1 & 1
    # set kernel y stride low bit
    stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9

    if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
        # Pooling kernel size comes from the op's ksize attribute.
        k_height, k_width = primary_op.attrs["ksize"][1:3]
        emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)

        valid_padding = sum(explicit_padding) == 0

        if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
            # For valid padding vela has to output scaling values
            if faf == Op.Sigmoid or faf == Op.Tanh:
                # Pre-scale the IFM into the fixed input range of the
                # hardware sigmoid/tanh (0x3000 == 3 * 4096).
                rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
                if cmd.ifm_tensor.dtype == DataType.int16:
                    # Calculate scale and shift for the output scale of 1/(3*4096)
                    shift = 0
                    max_rescale = np.iinfo(np.int16).max / 2
                    while rescale <= max_rescale and shift <= 30:
                        shift += 1
                        rescale *= 2
                    scale = int(rescale)
                else:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                    scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                    scale = int(round_away_zero(scale * rescale))
            elif fused_quantize:
                # Quantize op requires different scaling
                ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
                ofm_scale_f64 = np.double(ofm_quant.scale_f32)
                scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
            elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
                # ResizeBilinear carries a precomputed rescale attribute.
                rescale = primary_op.attrs["rescale"]
                rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                scale = int(round_away_zero(scale * rescale))
            else:
                # In case avg pool fused with concat or other memory operation, rescaling might be needed.
                # k_height == k_width == 1 is always true in this case
                # Normally the scale is maximised, to get maximum precision, which means that
                # if rescale != 1, scale need to consider the number of bits needed for rescaling
                if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
                    rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
                    rescale_bits = 0
                    if k_height == k_width == 1:
                        if fmf == Op.ConcatSliceWrite:
                            rounding_mode = rounding.NATURAL
                        if rescale > 1:
                            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                        elif rescale < 1:
                            rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
                    scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
                    scale = int(round_away_zero(scale * rescale))
                else:
                    # Missing quantisation info: fall back to identity scale.
                    scale = 1
                    shift = 0

            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
            # Valid-padded average pool should use the global scale from
            # NPU_SET_OFM_SCALE register, which is set above.
            use_global_scale = True

    else:  # Convolution
        assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
        # Reduced precision quantization and natural rounding used for int16
        if cmd.ifm_tensor.dtype == DataType.int16:
            rounding_mode = rounding.NATURAL
        # Fold dilation into the stride register bits 3..4.
        stride |= (cur_kernel.dilation.y - 1) << 4
        stride |= (cur_kernel.dilation.x - 1) << 3
        # Kernel size registers hold the dilated extent minus one.
        emit.cmd0_with_param(
            cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
        )
        emit.cmd0_with_param(
            cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
        )
        if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
            # Part-kernel-first weight ordering
            assert npu_block_type == NpuBlockType.ConvolutionMxN
            stride |= 1 << 2

    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
| 726 | |
elif npu_block_type in set((NpuBlockType.VectorProduct,)):
    # Vector product is implemented using a 1x1 convolution so need
    # to setup the appropriate padding and kernel info
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)

    # kernel stride reg = 0 means stride(1,1) + depth first weight
    # order + dilation(0,0) + kernel_split_size=8
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)

    # 1x1 kernel (the M1 registers are "minus one" encoded).
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
| 741 | |
if npu_block_type in set(
    (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
):
    # Emit Weight base address commands, only maps the area required for
    # this command's weights from the larger tensor.
    stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
    weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
    substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

    # Extract weight substream offsets and calculate their lengths
    assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
    weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)

    # Set weights sources for active and present cores
    # (core 0 uses WEIGHT_BASE/LENGTH, core 1 the WEIGHT1_* variants).
    for core, param in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < substreams:
            # This core has its own substream: base + per-core offset, and
            # the length is the distance to the next offset.
            emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core])
            emit.cmd1_with_offset(
                param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core]
            )
        elif core < arch.ncores:
            # Present core with no substream of its own: zero-length mapping.
            emit.cmd1_with_offset(param[0], weight_addr)
            emit.cmd1_with_offset(param[1], 0)

    weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 773 | |
# Emit Scale & Bias base address commands, with length matching the amount required by
# the weight tensors. Mirrors the per-core weight substream emission above.
if cmd.scale_tensor is not None:
    scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

    # Extract scale substream offsets and calculate their lengths
    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
    # Scale/bias are indexed by output channel only, hence the [-1:] coordinate.
    scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])

    # Set scale sources for active and present cores
    for core, param in enumerate(
        [
            (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
            (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
        ]
    ):
        if core < substreams:
            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core])
            emit.cmd1_with_offset(
                param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core]
            )
        elif core < arch.ncores:
            emit.cmd1_with_offset(param[0], scale_addr)
            emit.cmd1_with_offset(param[1], 0)

    # Emit base address for NPU to access scale & bias data
    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 803 | |
Fredrik Svedberg | 0f98b36 | 2020-09-29 10:00:39 +0200 | [diff] [blame] | 804 | ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min |
| 805 | ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max |
| 806 | ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min |
| 807 | ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 808 | |
# Emit commands for any fused activation function. Each branch programs the
# ACTIVATION register and computes the clamp range (faf_min/faf_max) in
# quantised OFM units.
if faf is None:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
    # Even if no activation function, values need to be set to override previous values
    faf_min = ofm_quant_qmin
    faf_max = ofm_quant_qmax
elif faf == Op.Relu:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
    faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
    faf_max = ofm_quant_qmax
elif faf == Op.Relu6:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
    faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
    faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
elif faf == Op.ReluN1To1:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
    faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
    faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
elif faf == Op.Tanh:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
    if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
        # Tanh output range is [-1, 1] in float.
        faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
        faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
    else:
        # Clamp range derived from the IFM range through tanh.
        faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
        faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
elif faf == Op.Sigmoid:
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
    if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
        # Sigmoid output range is [0, 1] in float.
        faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
        faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
    else:
        faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
        faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
elif faf == Op.LUT:
    # Activation via a look-up table: the register value selects the LUT slot.
    lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
    assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
    if cmd.ofm_tensor.dtype == DataType.int32:
        lut_index |= 3 << 12  # Force I8 range
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index)
    faf_min = ofm_quant_qmin
    faf_max = ofm_quant_qmax
else:
    # NOTE(review): a more specific exception type would aid callers.
    raise Exception("Unsupported fused_activation_function = " + faf.name)

# Activation range needs to be set based upon the quantisation range and the fused activation range
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))
| 857 | |
# Program OFM H/W/D from the output box; missing leading dims emit as 0
# (M1-encoded, so 0 means extent 1).
out_shape = cmd.ofm_box.get_size_shape()
if len(out_shape) >= 4:
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
else:
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
if len(out_shape) >= 2:
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
else:
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)

# IFM depth: blocks that contract over channels use the input box depth,
# others use the output depth.
if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)):
    in_shape = cmd.ifm_box.get_size_shape()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
else:
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)
| 874 | |
Jacob Bohlin | 3c67829 | 2020-04-27 10:27:25 +0200 | [diff] [blame] | 875 | for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in ( |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 876 | ( |
| 877 | cmd.ifm_tensor, |
| 878 | cmd.ifm_box, |
Jacob Bohlin | 3c67829 | 2020-04-27 10:27:25 +0200 | [diff] [blame] | 879 | cmd0.NPU_SET_IFM_REGION, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 880 | (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3), |
| 881 | (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X), |
| 882 | cmd0.NPU_SET_IFM_ZERO_POINT, |
| 883 | ), |
| 884 | ( |
| 885 | cmd.ifm2_tensor, |
| 886 | cmd.ifm2_box, |
Jacob Bohlin | 3c67829 | 2020-04-27 10:27:25 +0200 | [diff] [blame] | 887 | cmd0.NPU_SET_IFM2_REGION, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 888 | ( |
| 889 | cmd1.NPU_SET_IFM2_BASE0, |
| 890 | cmd1.NPU_SET_IFM2_BASE1, |
| 891 | cmd1.NPU_SET_IFM2_BASE2, |
| 892 | cmd1.NPU_SET_IFM2_BASE3, |
| 893 | ), |
| 894 | (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X), |
| 895 | cmd0.NPU_SET_IFM2_ZERO_POINT, |
| 896 | ), |
| 897 | ( |
| 898 | cmd.ofm_tensor, |
| 899 | cmd.ofm_box, |
Jacob Bohlin | 3c67829 | 2020-04-27 10:27:25 +0200 | [diff] [blame] | 900 | cmd0.NPU_SET_OFM_REGION, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 901 | (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3), |
| 902 | (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X), |
| 903 | cmd0.NPU_SET_OFM_ZERO_POINT, |
| 904 | ), |
| 905 | ): |
| 906 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 907 | if tens is None: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 908 | continue |
| 909 | |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 910 | need_zero_point = ( |
| 911 | (faf is not None and forced_ofm_quantization is None) |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 912 | or (fmf == Op.ConcatSliceWrite) |
Louis Verhaard | 98a3499 | 2020-09-01 10:39:04 +0200 | [diff] [blame] | 913 | or fused_quantize |
| 914 | ) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 915 | if ( |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 916 | (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point) |
Fredrik Svedberg | 237d72d | 2020-08-28 18:12:28 +0200 | [diff] [blame] | 917 | or ( |
| 918 | tens.dtype == DataType.int32 |
| 919 | and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT) |
| 920 | ) |
| 921 | or tens.quantization is None |
| 922 | ): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 923 | # Actual integer operation, just set scale to 1 and zero point to 0 |
| 924 | emit.cmd0_with_param(zero_point_op, 0) |
| 925 | else: |
| 926 | assert tens.quantization.zero_point is not None, "need an actual zero point set" |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 927 | if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None: |
| 928 | zero_point = forced_ofm_quantization.zero_point |
| 929 | elif ( |
Charles Xu | 9a03fdf | 2020-07-02 15:12:40 +0200 | [diff] [blame] | 930 | "resizebilinear" in primary_op.attrs |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 931 | and primary_op.type == Op.Add |
Charles Xu | 9a03fdf | 2020-07-02 15:12:40 +0200 | [diff] [blame] | 932 | and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op |
| 933 | ): |
| 934 | # Force output zero point same as the input zero point |
Louis Verhaard | d7911c4 | 2020-08-25 13:36:41 +0200 | [diff] [blame] | 935 | # for resizebilinear 1x1 that is converted to add |
Charles Xu | 9a03fdf | 2020-07-02 15:12:40 +0200 | [diff] [blame] | 936 | zero_point = cmd.ifm2_tensor.quantization.zero_point |
| 937 | else: |
| 938 | zero_point = tens.quantization.zero_point |
| 939 | emit.cmd0_with_param(zero_point_op, int(zero_point)) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 940 | |
| 941 | if tens.shape == []: |
| 942 | # Empty shape, elementwise constant |
Louis Verhaard | c88a96f | 2020-06-10 09:04:33 +0200 | [diff] [blame] | 943 | ifm2_scalar = tens.quant_values |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 944 | assert ifm2_scalar.size == 1 |
Louis Verhaard | c88a96f | 2020-06-10 09:04:33 +0200 | [diff] [blame] | 945 | emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0))) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 946 | continue |
| 947 | |
# Resolve the (possibly rolling-buffer split) tile heights/width and the
# four base addresses for this feature map's box.
height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
    box.start_coord, box.end_coord
)
if npu_block_type != NpuBlockType.VectorProduct:
    if tens == cmd.ifm_tensor:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
    elif tens == cmd.ofm_tensor:
        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
    # Separate `if` (not elif): a tensor can be both IFM and IFM2.
    if tens == cmd.ifm2_tensor:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
else:
    # Vector product: only a 1x(N) layout is supported here.
    if len(out_shape) == 2:
        assert out_shape[0] == 1
        if tens == cmd.ifm_tensor:
            emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
        elif tens == cmd.ofm_tensor:
            emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
    else:
        assert False

emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])

# Unused tiles have no address; the hardware still needs all four registers.
for idx, addr in enumerate(addresses):
    if addr is None:
        addresses[idx] = 0

emit.cmd1_with_offset(ptr_ops[0], addresses[0])
emit.cmd1_with_offset(ptr_ops[1], addresses[1])
emit.cmd1_with_offset(ptr_ops[2], addresses[2])
emit.cmd1_with_offset(ptr_ops[3], addresses[3])

strides = tens.get_strides()
emit.cmd1_with_offset(stride_ops[0], strides[1])  # stride between 16-byte channel blocks (C)
emit.cmd1_with_offset(stride_ops[2], strides[3])  # stride between horizontal values (W)
emit.cmd1_with_offset(stride_ops[1], strides[2])  # stride between vertical values (H)

if tens.format == TensorFormat.NHCWB16:
    # Check that all BasePointer addresses are aligned to 16 bytes
    assert (int(addresses[0]) % 16) == 0
    assert (int(addresses[1]) % 16) == 0
    assert (int(addresses[2]) % 16) == 0
    assert (int(addresses[3]) % 16) == 0
| 996 | |
# Build the OFM_PRECISION register value: element size/signedness in the low
# bits, plus global-scale, NHCWB16 and rounding-mode flag bits.
ofm_dtype = cmd.ofm_tensor.dtype
assert ofm_dtype.type & BaseType.Int
prec = 0
if ofm_dtype.size_in_bits() == 8:
    prec = 0
elif ofm_dtype.size_in_bits() == 16:
    prec = 2
elif ofm_dtype.size_in_bits() == 32:
    prec = 4
else:
    assert 0

# Odd encodings are the signed variants.
if ofm_dtype.type & BaseType.Signed:
    prec += 1

if use_global_scale:
    # Set global scale bit, as opposed to using per channel scale
    prec |= 1 << 8

if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
    prec |= 1 << 6

prec |= rounding_mode.value << 14

emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
| 1022 | |
# Build the IFM (and, for elementwise ops, IFM2) precision register values.
prec = None
weight_bits = 8  # default when the op has no weight tensor
if cmd.weight_tensor is not None:
    weight_bits = cmd.weight_tensor.dtype.size_in_bits()

ifm_dtype = cmd.ifm_tensor.dtype

assert weight_bits == 8, "Unsupported weight bit depth"
# 32-bit IFM is only allowed for elementwise and reduce-sum blocks;
# note `and` binds tighter than `or` here, which is the intended grouping.
assert (
    ifm_dtype.size_in_bits() in {8, 16}
    or ifm_dtype.size_in_bits() == 32
    and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
), "Unsupported ifm bit depth"

if ifm_dtype.size_in_bits() == 8:
    if ifm_dtype.type & BaseType.Signed:
        prec = ifm_precision.S8
    else:
        prec = ifm_precision.U8
elif ifm_dtype.size_in_bits() == 16:
    if ifm_dtype.type & BaseType.Signed:
        prec = ifm_precision.S16
    else:
        prec = ifm_precision.U16
elif ifm_dtype == DataType.int32:
    prec = ifm_precision.S32  # 32-bit IFM is always signed

# The assert above guarantees one of the branches assigned prec.
ifm_prec = prec.value
ifm2_prec = ifm_prec

if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
    ifm_prec |= 1 << 6  # input read in brick (NHCWB16) format

# op_to_scale is computed earlier in this function (outside this view);
# NOTE(review): presumably selects the elementwise rescale mode — confirm.
ifm_prec |= op_to_scale << 8

emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)

if cmd.ifm2_tensor is not None:
    # IFM2 shares the dtype-derived precision; only the format bit may differ.
    if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
        ifm2_prec |= 1 << 6
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
| 1064 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1065 | # Get op parameters |
| 1066 | cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd) |
| 1067 | cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3]) |
| 1068 | cur_ofm_rect = get_op_ofm_rect(cmd) |
| 1069 | cur_ifm_rect = get_op_ifm_rect(cmd) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1070 | cur_padLT = get_op_padding_lt(cmd) |
| 1071 | if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd): |
| 1072 | if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape: |
| 1073 | blockdep = arch.calc_block_dep( |
| 1074 | prev_ifm_rect, |
| 1075 | prev_ofm_rect, |
| 1076 | prev_ifm_block_depth, |
| 1077 | prev_ofm_block, |
| 1078 | prev_kernel, |
| 1079 | cur_ifm_rect, |
| 1080 | cur_ofm_rect, |
| 1081 | cur_ifm_block_depth, |
| 1082 | cur_ofm_block, |
| 1083 | cur_kernel, |
| 1084 | cur_padLT, |
| 1085 | ) |
| 1086 | else: |
| 1087 | blockdep = 0 |
| 1088 | else: |
| 1089 | blockdep = ArchitectureFeatures.MAX_BLOCKDEP |
| 1090 | |
| 1091 | # Set between every op (dependent or not) |
| 1092 | blockdep = min(blockdep, arch.max_blockdep) |
| 1093 | emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep) |
| 1094 | prev_cmd = cmd |
| 1095 | |
Tim Hall | 289a41d | 2020-08-04 21:40:14 +0100 | [diff] [blame] | 1096 | emit_cmd_waits(cmd_waits) |
| 1097 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1098 | if npu_block_type == NpuBlockType.ConvolutionMxN: |
| 1099 | emit.cmd_do_operation(cmd0.NPU_OP_CONV) |
| 1100 | elif npu_block_type == NpuBlockType.ConvolutionDepthWise: |
| 1101 | emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE) |
| 1102 | elif npu_block_type == NpuBlockType.VectorProduct: |
| 1103 | # Vector product is implemented using a 1x1 convolution |
| 1104 | emit.cmd_do_operation(cmd0.NPU_OP_CONV) |
| 1105 | elif npu_block_type == NpuBlockType.Pooling: |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 1106 | param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1107 | emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param) |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 1108 | elif npu_block_type == NpuBlockType.ReduceSum: |
| 1109 | emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1110 | elif npu_block_type == NpuBlockType.ElementWise: |
| 1111 | param = elementwise_mode_map[primary_op.type] |
| 1112 | emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param) |
| 1113 | else: |
| 1114 | print("Warning: Skipping register command stream generation for", ps) |
| 1115 | |
# Fill in final part of command stream:
# NPU_OP_STOP terminates the command stream for the processor.
emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

# Export the finished register command stream onto the subgraph.
sg.register_command_stream = emit.to_list()
if verbose:
    emit.print_cmds()
    print("number of commands", len(emit.cmd_stream))
    print("command stream length in words", len(sg.register_command_stream))