Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 18 | # Contains external APIs |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 19 | from enum import auto |
| 20 | from enum import Enum |
| 21 | from typing import List |
| 22 | from typing import NamedTuple |
| 23 | from typing import Optional |
| 24 | from typing import Tuple |
| 25 | |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 26 | import numpy |
| 27 | |
Patrik Gustavsson | c74682c | 2021-08-17 14:26:38 +0200 | [diff] [blame] | 28 | |
# Version of this external API, kept as separate major and minor components
API_VERSION_MAJOR = 1
API_VERSION_MINOR = 3
# The same version rendered as a "<major>.<minor>" string
API_VERSION = f"{API_VERSION_MAJOR}.{API_VERSION_MINOR}"
Patrik Gustavsson | c8a22f1 | 2020-11-18 17:05:50 +0100 | [diff] [blame] | 32 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 33 | |
class NpuAccelerator(Enum):
    """
    Accelerator configurations that can be selected through this API
    """

    # Ethos-U55 configurations
    Ethos_U55_32 = auto()
    Ethos_U55_64 = auto()
    Ethos_U55_128 = auto()
    Ethos_U55_256 = auto()
    # Ethos-U65 configurations
    Ethos_U65_256 = auto()
    Ethos_U65_512 = auto()
| 45 | |
| 46 | |
class NpuElementWiseOp(Enum):
    """
    The kinds of elementwise operation that can be requested
    """

    ADD = auto()
    SUB = auto()
    MUL = auto()
    ABS = auto()
    MIN = auto()
    MAX = auto()
    # Leaky relu
    LRELU = auto()
    # Number of leading zeros
    CLZ = auto()
    # Rounded right-shift
    SHR = auto()
    # Bitwise shift-left
    SHL = auto()
| 62 | |
| 63 | |
class NpuPoolingOp(Enum):
    """
    The kinds of pooling operation that can be requested
    """

    MAX = auto()
    AVERAGE = auto()
    REDUCE_SUM = auto()
| 72 | |
| 73 | |
class NpuActivationOp(Enum):
    """
    The activation functions that can be selected
    """

    # Output is clamped using min/max
    NONE_OR_RELU = auto()
    TANH = auto()
    SIGMOID = auto()
    # Table look-up is performed, using the provided table lookup index
    TABLE_LOOKUP = auto()
| 83 | |
| 84 | |
class NpuRoundingMode(Enum):
    """
    The rounding modes that can be selected
    """

    # TensorFlow Lite rounding
    TFL = auto()
    # Truncate towards zero
    TRUNCATE = auto()
    # Round to nearest with x.5 rounded up, towards +infinity
    NATURAL = auto()
| 93 | |
| 94 | |
class NpuLayout(Enum):
    """
    The tensor layouts a feature map can have
    """

    NHWC = auto()
    NHCWB16 = auto()

    def __str__(self):
        """Returns the member name, e.g. "NHWC" """
        member_name = self.name
        return member_name
| 105 | |
| 106 | |
class NpuResamplingMode(Enum):
    """
    The resampling modes that can be selected
    """

    # No resampling is performed
    NONE = auto()
    # 2x2 insert nearest
    NEAREST = auto()
    # 2x2 transpose
    TRANSPOSE = auto()
| 115 | |
| 116 | |
class NpuBlockTraversal(Enum):
    """
    The ways in which weights can be block-traversed
    """

    DEPTH_FIRST = auto()
    PART_KERNEL_FIRST = auto()
| 124 | |
| 125 | |
class NpuDataType(Enum):
    """
    Data types that can be used in feature maps.

    Each member's value is a tuple; element 0 is the size in bits and
    element 1 tells whether the type is signed.
    """

    UINT8 = 8, False, auto()
    INT8 = 8, True, auto()
    UINT16 = 16, False, auto()
    INT16 = 16, True, auto()
    INT32 = 32, True, auto()

    def is_signed(self) -> bool:
        """Returns True for a signed data type, False for an unsigned one"""
        signed = self.value[1]
        return signed

    def size_in_bits(self) -> int:
        """Returns the number of bits of this data type"""
        bit_count = self.value[0]
        return bit_count

    def size_in_bytes(self) -> int:
        """Returns the number of bytes of this data type"""
        return self.size_in_bits() // 8

    def min_value(self) -> int:
        """Returns the smallest value that this type can represent"""
        return -(1 << (self.size_in_bits() - 1)) if self.is_signed() else 0

    def max_value(self) -> int:
        """Returns the largest value that this type can represent"""
        # A signed type spends one bit on the sign
        magnitude_bits = self.size_in_bits() - 1 if self.is_signed() else self.size_in_bits()
        return (1 << magnitude_bits) - 1

    def __str__(self):
        """Returns the member name, e.g. "INT8" """
        return self.name

    # repr() gives the same text as str()
    __repr__ = __str__
| 167 | |
| 168 | |
class NpuAddressRange(NamedTuple):
    """
    A range of addresses inside a memory region
    """

    # Memory region, a value between 0 and 7
    region: int
    # Address, offset from the region's base address
    address: int
    # The length of the range, in bytes
    length: int

    def __str__(self):
        """Returns a readable form of the range, with the address in hexadecimal"""
        return f"(region={self.region}, address={hex(self.address)}, length={self.length})"
| 180 | |
| 181 | |
class NpuTileBox(NamedTuple):
    """
    Addresses and dimensions of the tiles that make up a feature map.
    Between 1 and 4 tiles can be used for a feature map
    """

    # The height of tile 0
    height_0: int
    # The height of tile 1, 0 if unused
    height_1: int
    # The width of tile 0, and tile 2 (if used)
    width_0: int
    # A list of 4 addresses, set unused addresses to 0
    addresses: List[int]
| 192 | |
| 193 | |
class NpuShape3D(NamedTuple):
    """
    Height, width and depth of (part of) a feature map
    """

    height: int
    width: int
    depth: int
| 202 | |
| 203 | |
class NpuQuantization(NamedTuple):
    """
    Scale and zero point used to quantize values
    """

    # The scale; may be None
    scale_f32: Optional[float]
    zero_point: int
| 211 | |
| 212 | |
class NpuPadding(NamedTuple):
    """
    The padding that a convolution operation should apply on each side
    """

    top: int
    left: int
    bottom: int
    right: int
| 222 | |
| 223 | |
class NpuActivation:
    """
    An activation function that is fused with an NPU operation
    """

    def __init__(self, op_type: NpuActivationOp):
        # Which activation operation is to be performed
        self.op_type = op_type
        # Optional lower bound, e.g. set to 0.0 for RELU
        self.min: Optional[float] = None
        # Optional upper bound, e.g. set to 6.0 for RELU6
        self.max: Optional[float] = None
        # Index (0-7) of the lookup table; only applicable for TABLE_LOOKUP activation
        self.lookup_table_index: int = 0
| 236 | |
| 237 | |
class NpuFeatureMap:
    """
    Basic information about IFM, IFM2, OFM

    A new instance starts with the default values assigned below; the caller
    is expected to overwrite the attributes that apply.
    """

    def __init__(self):
        self.data_type: NpuDataType = NpuDataType.UINT8
        # The memory region, a value 0-7
        self.region: int = 0
        # Shape of the feature map
        self.shape: NpuShape3D = NpuShape3D(height=0, width=0, depth=0)
        # The tiles that comprise the feature map. In the normal case when only 1 tile is used,
        # height_0 == self.shape.height, height_1 is 0, width_0 == self.shape.width, addresses[1:] are set to 0
        self.tiles: NpuTileBox = NpuTileBox(height_0=0, height_1=0, width_0=0, addresses=[0, 0, 0, 0])
        # Quantization parameters, None if not set.
        # This must be assigned explicitly: a bare annotation would not create the attribute,
        # and reading it (e.g. "fm.quantization is None") would then raise AttributeError
        self.quantization: Optional[NpuQuantization] = None
        self.layout: NpuLayout = NpuLayout.NHWC
        # x/y/c strides used by the NPU when traversing the feature map, if None, vela will use default strides
        self.strides: Optional[NpuShape3D] = None
        # Used for debug
        self.name: Optional[str] = None
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 258 | |
| 259 | |
class NpuKernel:
    """
    Size, stride and dilation of the kernel of an NPU operation
    """

    def __init__(self, w: int, h: int, stride_x: int = 1, stride_y: int = 1, dilation_x: int = 1, dilation_y: int = 1):
        # Strides and dilations must all be strictly positive
        assert stride_x > 0
        assert stride_y > 0
        assert dilation_x > 0
        assert dilation_y > 0
        self.width, self.height = w, h
        self.stride_x, self.stride_y = stride_x, stride_y
        self.dilation_x, self.dilation_y = dilation_x, dilation_y
| 274 | |
| 275 | |
class NpuOperationType(Enum):
    """
    The kinds of NPU operation
    """

    Dma = auto()
    Conv2D = auto()
    ConvDepthWise = auto()
    Pooling = auto()
    ElementWise = auto()
| 286 | |
| 287 | |
class NpuOperation:
    """
    Common base class of every NPU operation
    """

    def __init__(self, op_type: NpuOperationType):
        # The kind of operation, as given by the caller
        self.op_type = op_type
        # Name of the operation, None by default; used for debug
        self.name: Optional[str] = None
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 297 | |
| 298 | |
class NpuDmaOperation(NpuOperation):
    """
    A DMA operation, described by a source and a destination address range
    """

    def __init__(self, src: NpuAddressRange, dest: NpuAddressRange):
        super().__init__(NpuOperationType.Dma)
        # Source and destination address ranges, as given by the caller
        self.src = src
        self.dest = dest
        # The DMA channel; usually 0 (user channel)
        self.channel: int = 0
        # The channel mode: 0 = external, 1 = internal (should usually be 0)
        self.mode: int = 0
| 312 | |
| 313 | |
class NpuBlockOperation(NpuOperation):
    """
    Common base class of the operations that produce an OFM
    """

    def __init__(self, op_type: NpuOperationType):
        super().__init__(op_type)
        # Input feature maps and output feature map, all None until set by the caller
        self.ifm: Optional[NpuFeatureMap] = None
        self.ifm2: Optional[NpuFeatureMap] = None
        # Non-quantized scalar value of a binary elementwise operation;
        # only set if IFM2 is scalar
        self.ifm2_scalar: Optional[float] = None
        self.ofm: Optional[NpuFeatureMap] = None
        self.kernel: Optional[NpuKernel] = None
        # Weights: one element per NPU core, empty if no weights are used.
        # They must have been compressed using npu_encode_weights()
        self.weights: List[NpuAddressRange] = []
        # Biases: one element per NPU core, empty if no bias is used.
        # They must have been encoded using npu_encode_bias()
        self.biases: List[NpuAddressRange] = []
        self.padding: Optional[NpuPadding] = None
        # Activation function to be applied, if any
        self.activation: Optional[NpuActivation] = None
        # The block config to be used; it must be valid for the given operation,
        # see also npu_find_block_configs.
        # For an operation with weights, the depth of the block config must equal
        # the ofm depth that was used in the call to npu_encode_weights().
        # Note that this line only declares the attribute, no value is assigned here
        self.block_config: NpuShape3D
        self.rounding_mode: NpuRoundingMode = NpuRoundingMode.TFL
        # True if the operation is fused with a Quantize operation (affects scaling)
        self.fused_quantize: bool = False
        # The IFM upscaling to be applied
        self.ifm_upscale: NpuResamplingMode = NpuResamplingMode.NONE
| 346 | |
| 347 | |
class NpuConv2DOperation(NpuBlockOperation):
    """
    The NPU_OP_CONV operation
    """

    def __init__(self):
        super().__init__(NpuOperationType.Conv2D)
        # This must be consistent with the block_traversal parameter that was
        # specified in weight_compressor.encode_weights()
        self.block_traversal: NpuBlockTraversal = NpuBlockTraversal.PART_KERNEL_FIRST
| 358 | |
| 359 | |
class NpuConvDepthWiseOperation(NpuBlockOperation):
    """
    The NPU_OP_DEPTHWISE operation
    """

    def __init__(self):
        # No attributes are added to the ones of the base class
        super().__init__(NpuOperationType.ConvDepthWise)
| 367 | |
| 368 | |
class NpuPoolingOperation(NpuBlockOperation):
    """
    The NPU_OP_POOL operation
    """

    def __init__(self, pooling_op_type: NpuPoolingOp):
        super().__init__(NpuOperationType.Pooling)
        # The kind of pooling to be performed
        self.sub_op_type: NpuPoolingOp = pooling_op_type
        # A float value for ResizeBilinear operations (affects scaling), None otherwise
        self.rescale: Optional[float] = None
| 379 | |
| 380 | |
class NpuElementWiseOperation(NpuBlockOperation):
    """
    The NPU_OP_ELEMENTWISE operation
    """

    def __init__(self, elementwise_op_type: NpuElementWiseOp):
        super().__init__(NpuOperationType.ElementWise)
        # The kind of elementwise operation to be performed
        self.sub_op_type: NpuElementWiseOp = elementwise_op_type
        # True for binary operators where IFM2 should be used as first operand
        self.reversed_operands: bool = False
        # A tuple (scale, shift) for explicit rescale, None otherwise
        self.rescale: Optional[Tuple] = None
Patrik Gustavsson | c8a22f1 | 2020-11-18 17:05:50 +0100 | [diff] [blame] | 393 | |
| 394 | |
def npu_get_api_version():
    """
    Public facing API that returns the API version packed into one integer
    :return: int, of which the 16 most significant bits hold the major version
             and the 16 least significant bits hold the minor version
    """
    major_bits = API_VERSION_MAJOR << 16
    # Only the low 16 bits of the minor version are kept
    minor_bits = API_VERSION_MINOR & 0xFFFF
    return major_bits | minor_bits
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 403 | |
| 404 | |
def npu_encode_weights(
    accelerator: NpuAccelerator,
    weights_volume: numpy.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Public facing API for encoding weights with the Ethos-U weight encoding.

    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis
    :return: a bytearray of encoded weights
    """
    from . import weight_compressor
    from .architecture_features import Accelerator

    # Translate the API-level accelerator enum before calling the encoder
    internal_accelerator = Accelerator.from_npu_accelerator(accelerator)
    # The encoder returns a pair; only its first element is handed back
    encoded, _unused = weight_compressor.encode_weights(
        internal_accelerator,
        weights_volume,
        dilation_xy,
        ifm_bitdepth,
        ofm_block_depth,
        is_depthwise,
        block_traversal,
    )
    return encoded
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 434 | |
| 435 | |
def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
    """
    Public facing API for packing bias and scale values in the form required by the hardware
    :param bias: 64-bit signed number that includes 40-bit signed bias
    :param scale: 32-bit scale value
    :param shift: 6-bit shift value
    :return: packed 80-bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    from . import weight_compressor

    # The packing itself is done by weight_compressor.encode_bias
    packed_bias = weight_compressor.encode_bias(bias, scale, shift)
    return packed_bias
| 447 | |
| 448 | |
def npu_find_block_configs(npu_op: NpuOperation, accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Public facing API that returns a list of block configs that are valid for the given operation.
    This function can be used to find a valid value for npu_op.block_config.
    The block config is the unit of work in which the NPU generates the OFM.

    :param npu_op: the operation to find block configs for; must be a Conv2D, depthwise,
                   pooling or elementwise operation with its ifm and ofm set
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: list of NpuShape3D block configs that were accepted by try_block_config
    :raises AssertionError: if the operation type is not supported, or (through an assert
                            statement) if no valid block config was found
    """
    from .architecture_features import Accelerator
    from .architecture_features import ArchitectureFeatures
    from .architecture_features import Block
    from .architecture_features import create_default_arch
    from .architecture_allocator import try_block_config
    from .register_command_stream_generator import resampling_mode_map
    from .register_command_stream_util import to_kernel
    from .operation import NpuBlockType

    is_partkernel = False
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
        is_partkernel = npu_op.block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        # Raised explicitly rather than with "assert 0", so that the check is still
        # performed when assert statements are disabled (python -O)
        raise AssertionError("Unsupported operation")

    ifm_shape = Block(npu_op.ifm.shape.width, npu_op.ifm.shape.height, npu_op.ifm.shape.depth)
    ifm2_shape = None
    if npu_op.ifm2:
        ifm2_shape = Block(npu_op.ifm2.shape.width, npu_op.ifm2.shape.height, npu_op.ifm2.shape.depth)
    ofm_shape = Block(npu_op.ofm.shape.width, npu_op.ofm.shape.height, npu_op.ofm.shape.depth)

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    kernel = to_kernel(npu_op.kernel)
    uses_lut = bool(npu_op.activation) and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    ifm2_is_scalar = npu_op.ifm2_scalar is not None

    # Scaling is used only if every feature map that is present has quantization.
    # getattr is used because NpuFeatureMap.quantization may never have been assigned;
    # that case is treated as "no quantization" instead of failing with AttributeError
    has_scaling = all(
        getattr(fm, "quantization", None) is not None for fm in (npu_op.ifm, npu_op.ifm2, npu_op.ofm) if fm
    )

    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

    max_block_width = min(arch.ofm_block_max.width, ofm_shape.width)
    max_block_height = min(arch.ofm_block_max.height, ofm_shape.height)
    max_block_depth = min(arch.ofm_block_max.depth, ofm_shape.depth)

    # With IFM upscaling the block must be at least 2 wide and 2 high.
    # The test is done on npu_op.ifm_upscale, which is a NpuResamplingMode; the value looked up
    # in resampling_mode_map is only passed on to try_block_config.
    # NOTE(review): presumably the map yields a different enum type than NpuResamplingMode,
    # which would make a comparison of the mapped value with NpuResamplingMode.NONE always unequal
    min_upscaled_size = 1 if npu_op.ifm_upscale == NpuResamplingMode.NONE else 2
    min_block_height = max(arch.ofm_ublock.height, min_upscaled_size)
    min_block_width = max(arch.ofm_ublock.width, min_upscaled_size)

    # OFM block depth has the constraint that if it causes the OFM to be
    # split, it must be a multiple of the OFM split size.
    # The candidate depths do not depend on width/height, so they are computed once
    depth_step = arch.ofm_ublock.depth
    candidate_depths = [
        c
        for c in range(depth_step, max_block_depth + depth_step, depth_step)
        if c >= max_block_depth or c % ArchitectureFeatures.OFMSplitDepth == 0
    ]

    valid_block_configs = []
    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
            # Try valid OFM block depths
            for c in candidate_depths:
                config = try_block_config(
                    Block(w, h, c),
                    arch,
                    block_type,
                    ofm_shape,
                    ifm_shape,
                    ifm2_shape,
                    ifm2_is_scalar,
                    ifm_bits,
                    is_partkernel,
                    kernel,
                    lut_banks,
                    has_scaling,
                    ifm_resampling_mode,
                )

                if config:
                    # The block that is reported is the one of the returned config
                    ofm_block = config.ofm_block
                    valid_block_configs.append(NpuShape3D(ofm_block.height, ofm_block.width, ofm_block.depth))

    assert len(valid_block_configs) > 0
    return valid_block_configs
Louis Verhaard | 933f55e | 2020-11-25 14:10:30 +0100 | [diff] [blame] | 536 | |
| 537 | |
def npu_generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: NpuAccelerator) -> List[int]:
    """
    Public facing API that generates an Ethos-U register command stream.
    Dependencies between commands are calculated and wait operations are inserted if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: register commands, as a list of 32-bit integers
    """
    from . import register_command_stream_generator as generator

    command_stream = generator.generate_register_command_stream(npu_op_list, accelerator)
    return command_stream
Louis Verhaard | 5207830 | 2020-11-18 13:35:06 +0100 | [diff] [blame] | 550 | |
| 551 | |
def npu_create_driver_payload(register_command_stream: List[int], accelerator: NpuAccelerator) -> bytes:
    """
    Public facing API that generates a driver payload, which consists of a driver header
    followed by the given Ethos-U register command stream.
    The returned payload is in little endian format and must be placed in memory on a
    16-byte aligned address.

    :param register_command_stream: List[int] register commands, as a list of 32-bit integers
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: driver payload, as a byte array
    """
    from . import driver_actions

    payload = driver_actions.npu_create_driver_payload(register_command_stream, accelerator)
    return payload