# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains external APIs
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 19 | from enum import auto |
| 20 | from enum import Enum |
| 21 | from typing import List |
| 22 | from typing import NamedTuple |
| 23 | from typing import Optional |
| 24 | from typing import Tuple |
| 25 | |
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 26 | import numpy |
| 27 | |
# Version of the external API, split into its major and minor component
API_VERSION_MAJOR = 1
API_VERSION_MINOR = 0
# Textual "<major>.<minor>" form of the API version
API_VERSION = ".".join(str(part) for part in (API_VERSION_MAJOR, API_VERSION_MINOR))
Patrik Gustavsson | c8a22f1 | 2020-11-18 17:05:50 +0100 | [diff] [blame] | 31 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 32 | |
class NpuAccelerator(Enum):
    """
    Accelerator configurations that can be selected in the external API
    """

    # Values are generated by auto(), so they follow the declaration order
    Ethos_U55_32 = auto()
    Ethos_U55_64 = auto()
    Ethos_U55_128 = auto()
    Ethos_U55_256 = auto()
    Ethos_U65_256 = auto()
    Ethos_U65_512 = auto()
| 44 | |
| 45 | |
class NpuElementWiseOp(Enum):
    """
    Sub-operation performed by an elementwise operation
    """

    # Values are generated by auto(), so they follow the declaration order
    ADD = auto()
    SUB = auto()
    MUL = auto()
    ABS = auto()
    MIN = auto()
    MAX = auto()
    # Leaky relu
    LRELU = auto()
    # Number of leading zeros
    CLZ = auto()
    # Rounded right-shift
    SHR = auto()
    # Bitwise shift-left
    SHL = auto()
| 61 | |
| 62 | |
class NpuPoolingOp(Enum):
    """
    Sub-operation performed by a pooling operation
    """

    # Values are generated by auto(), so they follow the declaration order
    MAX = auto()
    AVERAGE = auto()
    REDUCE_SUM = auto()
| 71 | |
| 72 | |
class NpuActivationOp(Enum):
    """
    The kinds of activation function that are available
    """

    # Clamps output using min/max
    NONE_OR_RELU = auto()
    TANH = auto()
    SIGMOID = auto()
    # Performs table look-up, using the provided table lookup index
    TABLE_LOOKUP = auto()
| 82 | |
| 83 | |
class NpuRoundingMode(Enum):
    """
    Rounding modes that can be selected
    """

    # TensorFlow Lite rounding
    TFL = auto()
    # Truncate towards zero
    TRUNCATE = auto()
    # Round to nearest with x.5 rounded up, towards +infinity
    NATURAL = auto()
| 92 | |
| 93 | |
class NpuLayout(Enum):
    """
    Tensor layouts that a feature map can use
    """

    NHWC = auto()
    NHCWB16 = auto()

    def __str__(self):
        # Print only the member name, without the class name prefix
        layout_name = self.name
        return layout_name
| 104 | |
| 105 | |
class NpuResamplingMode(Enum):
    """
    Available resampling modes
    """

    # No resampling is performed
    NONE = auto()
    # 2x2 insert nearest
    NEAREST = auto()
    # 2x2 transpose
    TRANSPOSE = auto()
| 114 | |
| 115 | |
class NpuBlockTraversal(Enum):
    """
    The ways in which weights can be block-traversed
    """

    # Values are generated by auto(), so they follow the declaration order
    DEPTH_FIRST = auto()
    PART_KERNEL_FIRST = auto()
| 123 | |
| 124 | |
class NpuDataType(Enum):
    """
    Data types that feature maps can use
    """

    # Every member value is a tuple: (size in bits, signed flag, auto() entry)
    UINT8 = 8, False, auto()
    INT8 = 8, True, auto()
    UINT16 = 16, False, auto()
    INT16 = 16, True, auto()
    INT32 = 32, True, auto()

    def is_signed(self) -> bool:
        """Returns True if this data type is signed, False if it is unsigned"""
        signed = self.value[1]
        return signed

    def size_in_bits(self) -> int:
        """Returns the number of bits used by this data type"""
        bits = self.value[0]
        return bits

    def size_in_bytes(self) -> int:
        """Returns the number of bytes used by this data type"""
        bits = self.value[0]
        return bits // 8

    def min_value(self) -> int:
        """Returns the smallest value that this data type can hold"""
        # Unsigned types start at zero
        if not self.is_signed():
            return 0
        return -(1 << (self.size_in_bits() - 1))

    def max_value(self) -> int:
        """Returns the largest value that this data type can hold"""
        value_bits = self.size_in_bits()
        if self.is_signed():
            # One bit is reserved for the sign
            value_bits -= 1
        return (1 << value_bits) - 1

    def __str__(self):
        return self.name

    # repr() gives the plain member name as well
    __repr__ = __str__
| 166 | |
| 167 | |
class NpuAddressRange(NamedTuple):
    """
    A range of addresses inside a memory region
    """

    # Memory region, a value between 0 and 7
    region: int
    # Address, offset from the region's base address
    address: int
    # The length of the range, in bytes
    length: int

    def __str__(self):
        # The address is shown in hexadecimal, the other fields in decimal
        hex_address = hex(self.address)
        return f"(region={self.region}, address={hex_address}, length={self.length})"
| 179 | |
| 180 | |
class NpuTileBox(NamedTuple):
    """
    Addresses and dimensions of the tiles that make up a feature map.
    Between 1 and 4 tiles can be used for a feature map
    """

    # The height of tile 0
    height_0: int
    # The height of tile 1, 0 if unused
    height_1: int
    # The width of tile 0, and tile 2 (if used)
    width_0: int
    # A list of 4 addresses, set unused addresses to 0
    addresses: List[int]
| 191 | |
| 192 | |
class NpuShape3D(NamedTuple):
    """
    Three-dimensional shape (height, width, depth) of a feature map or of a part of it
    """

    height: int
    width: int
    depth: int
| 201 | |
| 202 | |
class NpuQuantization(NamedTuple):
    """
    Parameters that describe a quantization: a scale and a zero point
    """

    # Scale as float; may be None
    scale_f32: Optional[float]
    zero_point: int
| 210 | |
| 211 | |
class NpuPadding(NamedTuple):
    """
    Amount of padding that a convolution operation applies on each side
    """

    top: int
    left: int
    bottom: int
    right: int
| 221 | |
| 222 | |
class NpuActivation:
    """
    Describes an activation function that is fused with an NPU operation
    """

    def __init__(self, op_type: NpuActivationOp):
        # The activation operation to be performed
        self.op_type = op_type
        # Optional lower limit, e.g. set to 0.0 for RELU
        self.min: Optional[float] = None
        # Optional upper limit, e.g. set to 6.0 for RELU6
        self.max: Optional[float] = None
        # Index (0-7) of the lookup table, only applicable for TABLE_LOOKUP activation
        self.lookup_table_index: int = 0
| 235 | |
| 236 | |
class NpuFeatureMap:
    """
    Basic information about IFM, IFM2, OFM
    """

    def __init__(self):
        # Data type of the elements in the feature map
        self.data_type: NpuDataType = NpuDataType.UINT8
        # The memory region, a value 0-7
        self.region: int = 0
        # Shape of the feature map
        self.shape: NpuShape3D = NpuShape3D(height=0, width=0, depth=0)
        # The tiles that comprise the feature map. In the normal case when only 1 tile is used,
        # height_0 == self.shape.height, height_1 is 0, width_0 == self.shape.width, addresses[1:] are set to 0
        self.tiles: NpuTileBox = NpuTileBox(height_0=0, height_1=0, width_0=0, addresses=[0, 0, 0, 0])
        # Quantization parameters. Explicitly initialized to None: an annotation without a value
        # does not create the attribute, so reading it before it was set raised AttributeError
        self.quantization: Optional[NpuQuantization] = None
        # Tensor layout of the feature map
        self.layout: NpuLayout = NpuLayout.NHWC
        # x/y/c strides used by the NPU when traversing the feature map, if None, vela will use default strides
        self.strides: Optional[NpuShape3D] = None
| 255 | |
| 256 | |
class NpuKernel:
    """
    Describes the kernel (size, stride and dilation) used by NPU operations
    """

    def __init__(self, w: int, h: int, stride_x: int = 1, stride_y: int = 1, dilation_x: int = 1, dilation_y: int = 1):
        # Strides and dilations must be positive
        assert stride_x > 0 and stride_y > 0
        assert dilation_x > 0 and dilation_y > 0
        # Kernel size
        self.width, self.height = w, h
        # Stride in x and y direction
        self.stride_x, self.stride_y = stride_x, stride_y
        # Dilation in x and y direction
        self.dilation_x, self.dilation_y = dilation_x, dilation_y
| 271 | |
| 272 | |
class NpuOperationType(Enum):
    """
    The kinds of NPU operation that exist
    """

    # Values are generated by auto(), so they follow the declaration order
    Dma = auto()
    Conv2D = auto()
    ConvDepthWise = auto()
    Pooling = auto()
    ElementWise = auto()
| 283 | |
| 284 | |
class NpuOperation:
    """
    Common base class of every NPU operation
    """

    def __init__(self, op_type: NpuOperationType):
        # The kind of operation that this object represents
        self.op_type = op_type
| 292 | |
| 293 | |
class NpuDmaOperation(NpuOperation):
    """
    Operation that performs a DMA transfer from a source to a destination address range
    """

    def __init__(self, src: NpuAddressRange, dest: NpuAddressRange):
        super().__init__(NpuOperationType.Dma)
        # Source and destination address range of the transfer
        self.src, self.dest = src, dest
        # DMA channel, usually 0 (user channel)
        self.channel: int = 0
        # Channel mode, 0 = external, 1 = internal (should usually be 0)
        self.mode: int = 0
| 307 | |
| 308 | |
class NpuBlockOperation(NpuOperation):
    """
    Base class for operations which produce an OFM
    """

    def __init__(self, op_type: NpuOperationType):
        super().__init__(op_type)
        # Input feature map(s) and output feature map
        self.ifm: Optional[NpuFeatureMap] = None
        self.ifm2: Optional[NpuFeatureMap] = None
        # The non-quantized scalar value in a binary elementwise operation. Only set if IFM2 is scalar
        self.ifm2_scalar: Optional[float] = None
        self.ofm: Optional[NpuFeatureMap] = None
        self.kernel: Optional[NpuKernel] = None
        # Weights, one element for each NPU core, empty if no weights are used.
        # Must have been compressed using npu_encode_weights()
        self.weights: List[NpuAddressRange] = []
        # Biases, one element for each NPU core, empty if no bias is used.
        # Must have been encoded using npu_encode_bias()
        self.biases: List[NpuAddressRange] = []
        self.padding: Optional[NpuPadding] = None
        # Optional activation function to be applied
        self.activation: Optional[NpuActivation] = None
        # The block config to be used, which must be valid for the given operation.
        # See also npu_find_block_configs.
        # If the operation has weights, the depth of the block config must be the same as
        # the ofm depth used in the call to npu_encode_weights()
        # Explicitly initialized to None: an annotation without a value does not create the
        # attribute, so reading it before it was set raised AttributeError
        self.block_config: Optional[NpuShape3D] = None
        self.rounding_mode: NpuRoundingMode = NpuRoundingMode.TFL
        # Set to True if the operations is fused with a Quantize operation (affects scaling)
        self.fused_quantize: bool = False
        # IFM upscaling to be applied
        self.ifm_upscale: NpuResamplingMode = NpuResamplingMode.NONE
| 341 | |
| 342 | |
class NpuConv2DOperation(NpuBlockOperation):
    """
    Block operation of type Conv2D (NPU_OP_CONV)
    """

    def __init__(self):
        super().__init__(NpuOperationType.Conv2D)
        # Must be consistent with the block_traversal parameter that was specified when the
        # weights were encoded, see weight_compressor.encode_weights()
        self.block_traversal: NpuBlockTraversal = NpuBlockTraversal.PART_KERNEL_FIRST
| 353 | |
| 354 | |
class NpuConvDepthWiseOperation(NpuBlockOperation):
    """
    Block operation of type ConvDepthWise (NPU_OP_DEPTHWISE)
    """

    def __init__(self):
        # No attributes are added on top of the ones from the base class
        super().__init__(NpuOperationType.ConvDepthWise)
| 362 | |
| 363 | |
class NpuPoolingOperation(NpuBlockOperation):
    """
    Block operation of type Pooling (NPU_OP_POOL)
    """

    def __init__(self, pooling_op_type: NpuPoolingOp):
        super().__init__(NpuOperationType.Pooling)
        # The pooling sub-operation to be performed
        self.sub_op_type: NpuPoolingOp = pooling_op_type
        # Float value for ResizeBilinear operations (affects scaling), None for all other cases
        self.rescale: Optional[float] = None
| 374 | |
| 375 | |
class NpuElementWiseOperation(NpuBlockOperation):
    """
    Block operation of type ElementWise (NPU_OP_ELEMENTWISE)
    """

    def __init__(self, elementwise_op_type: NpuElementWiseOp):
        super().__init__(NpuOperationType.ElementWise)
        # The elementwise sub-operation to be performed
        self.sub_op_type: NpuElementWiseOp = elementwise_op_type
        # True for binary operators where IFM2 should be used as first operand
        self.reversed_operands: bool = False
        # Tuple (scale, shift) for explicit rescale, None for all other cases
        self.rescale: Optional[Tuple] = None
Patrik Gustavsson | c8a22f1 | 2020-11-18 17:05:50 +0100 | [diff] [blame] | 388 | |
| 389 | |
def npu_get_api_version():
    """
    Public facing API that returns the API version, packed in a single integer
    :return: int, the 16 most significant bits, corresponding to major version
    the 16 least significant bits, corresponding to minor version
    """
    # Major version goes in the upper bits, the minor version is masked to the lower 16 bits
    major_bits = API_VERSION_MAJOR << 16
    minor_bits = API_VERSION_MINOR & 0xFFFF
    return major_bits | minor_bits
Louis Verhaard | aeae567 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 398 | |
| 399 | |
def npu_encode_weights(
    accelerator: NpuAccelerator,
    weights_volume: numpy.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Public facing API that encodes weights with the Ethos-U weight encoding.

    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis
    :return: a bytearray of compressed weights
    """
    # Imported inside the function, not at module level
    from .architecture_features import Accelerator
    from . import weight_compressor

    # Convert the API accelerator value to the Accelerator type used by the weight compressor
    internal_accelerator = Accelerator.from_npu_accelerator(accelerator)
    encoded_weights = weight_compressor.encode_weights(
        internal_accelerator,
        weights_volume,
        dilation_xy,
        ifm_bitdepth,
        ofm_block_depth,
        is_depthwise,
        block_traversal,
    )
    return encoded_weights
| 428 | |
| 429 | |
def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
    """
    Public facing API that packs bias and scale values in the format required by the hardware
    :param bias: 64-bit signed number that includes 40-bit signed bias
    :param scale: 32-bit scale value
    :param shift: 6-bit shift value
    :return: packed 80-bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Imported inside the function, not at module level
    from . import weight_compressor

    packed_bias = weight_compressor.encode_bias(bias, scale, shift)
    return packed_bias
| 441 | |
| 442 | |
def npu_find_block_configs(npu_op: NpuOperation, accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Public facing API that finds the block configs that are valid for the given operation.
    The block config is the unit of work in which the NPU generates the OFM.
    Use this function to find a valid value for npu_op.block_config.

    :param npu_op: the NPU operation to find block configs for
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: list of valid block configs
    """
    # Imported inside the function, not at module level
    from . import register_command_stream_generator

    valid_block_configs = register_command_stream_generator.find_block_configs(npu_op, accelerator)
    return valid_block_configs
| 452 | |
| 453 | |
def npu_generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: NpuAccelerator) -> List[int]:
    """
    Public facing API that generates an Ethos-U register command stream.
    Dependencies between commands are calculated, and wait operations are inserted if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: register commands, as a list of 32-bit integers
    """
    # Imported inside the function, not at module level
    from . import register_command_stream_generator

    register_commands = register_command_stream_generator.generate_register_command_stream(npu_op_list, accelerator)
    return register_commands
Louis Verhaard | 5207830 | 2020-11-18 13:35:06 +0100 | [diff] [blame] | 466 | |
| 467 | |
def npu_create_driver_payload(register_command_stream: List[int], accelerator: NpuAccelerator) -> bytes:
    """
    Public facing API that generates a driver payload, which consists of a driver header
    followed by the given Ethos-U register command stream.
    The returned payload is in little endian format, and must be placed in memory on a
    16-byte aligned address.

    :param register_command_stream: List[int] register commands, as a list of 32-bit integers
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: driver payload, as a byte array
    """
    # Imported inside the function, not at module level
    from . import driver_actions

    payload = driver_actions.npu_create_driver_payload(register_command_stream, accelerator)
    return payload