blob: 7125e889d809ddc9df48a4905547372dd73d1e8a [file] [log] [blame]
# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains external APIs
Louis Verhaarde8a5a782020-11-02 18:04:27 +010019from enum import auto
20from enum import Enum
21from typing import List
22from typing import NamedTuple
23from typing import Optional
24from typing import Tuple
25
Louis Verhaardaeae5672020-11-02 18:04:27 +010026import numpy
27
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020028
# Version of this external API; bump on any change to the public interface.
# The combined value is also exposed packed into one int by npu_get_api_version().
API_VERSION_MAJOR = 1
API_VERSION_MINOR = 5
API_VERSION = f"{API_VERSION_MAJOR}.{API_VERSION_MINOR}"
Patrik Gustavssonc8a22f12020-11-18 17:05:50 +010032
Louis Verhaarde8a5a782020-11-02 18:04:27 +010033
class NpuAccelerator(Enum):
    """
    Supported accelerators
    """

    # NOTE(review): the numeric suffix presumably denotes the accelerator's MAC
    # configuration — confirm against the Arm Ethos-U55/U65 documentation
    Ethos_U55_32 = auto()
    Ethos_U55_64 = auto()
    Ethos_U55_128 = auto()
    Ethos_U55_256 = auto()
    Ethos_U65_256 = auto()
    Ethos_U65_512 = auto()
45
46
class NpuElementWiseOp(Enum):
    """
    Elementwise operation, used as the sub-operation type of NpuElementWiseOperation
    """

    ADD = auto()  # Addition
    SUB = auto()  # Subtraction
    MUL = auto()  # Multiplication
    ABS = auto()  # Absolute value
    MIN = auto()  # Elementwise minimum
    MAX = auto()  # Elementwise maximum
    LRELU = auto()  # Leaky relu
    CLZ = auto()  # Number leading zeros
    SHR = auto()  # Rounded right-shift
    SHL = auto()  # Bitwise shift-left
62
63
class NpuPoolingOp(Enum):
    """
    Pooling operation, used as the sub-operation type of NpuPoolingOperation
    """

    MAX = auto()  # Max pooling
    AVERAGE = auto()  # Average pooling
    REDUCE_SUM = auto()  # Sum reduction
72
73
class NpuActivationOp(Enum):
    """
    Activation function (see NpuActivation, where the operation is configured)
    """

    NONE_OR_RELU = auto()  # Clamps output using min/max
    TANH = auto()  # Hyperbolic tangent
    SIGMOID = auto()  # Sigmoid
    TABLE_LOOKUP = auto()  # Performs table look-up, using the provided table lookup index
83
84
class NpuRoundingMode(Enum):
    """
    Available rounding modes.
    The default for block operations is TFL (see NpuBlockOperation.rounding_mode)
    """

    TFL = auto()  # TensorFlow Lite rounding
    TRUNCATE = auto()  # Truncate towards zero
    NATURAL = auto()  # Round to nearest with x.5 rounded up, towards +infinity
93
94
class NpuLayout(Enum):
    """
    Tensor layout of feature maps
    """

    NHWC = auto()  # Linear layout with axes in N, H, W, C order (as the name spells out)
    # NOTE(review): presumably a 16-channel "brick" layout — confirm against NPU documentation
    NHCWB16 = auto()

    def __str__(self):
        # Render as the bare member name, e.g. "NHWC"
        return self.name
105
106
class NpuResamplingMode(Enum):
    """
    Resampling mode, used for IFM upscaling (see NpuBlockOperation.ifm_upscale)
    """

    NONE = auto()  # No resampling is performed
    NEAREST = auto()  # 2x2 insert nearest
    TRANSPOSE = auto()  # 2x2 transpose
115
116
class NpuBlockTraversal(Enum):
    """
    Block-traversal of weights.
    Must be consistent with the block_traversal passed to npu_encode_weights()
    (see NpuConv2DOperation.block_traversal)
    """

    DEPTH_FIRST = auto()
    PART_KERNEL_FIRST = auto()
124
125
class NpuDataType(Enum):
    """
    Supported data types in feature maps.
    Each member's value is a (bit-width, signedness, unique tag) triple; the tag
    keeps equal-width types (e.g. UINT8/INT8) distinct enum members.
    """

    UINT8 = 8, False, auto()
    INT8 = 8, True, auto()
    UINT16 = 16, False, auto()
    INT16 = 16, True, auto()
    INT32 = 32, True, auto()

    def is_signed(self) -> bool:
        """Checks if this data type is signed or unsigned"""
        _, signed, _ = self.value
        return signed

    def size_in_bits(self) -> int:
        """Size of the data type in bits"""
        bits, _, _ = self.value
        return bits

    def size_in_bytes(self) -> int:
        """Size of the data type in bytes"""
        return self.size_in_bits() // 8

    def min_value(self) -> int:
        """Minimum representable value of this type"""
        return -(1 << (self.size_in_bits() - 1)) if self.is_signed() else 0

    def max_value(self) -> int:
        """Maximum representable value of this type"""
        unsigned_max = (1 << self.size_in_bits()) - 1
        # For signed types the sign bit halves the positive range
        return unsigned_max >> 1 if self.is_signed() else unsigned_max

    def __str__(self):
        return self.name

    __repr__ = __str__
167
168
class NpuAddressRange(NamedTuple):
    """
    A contiguous range of NPU-addressable memory
    """

    region: int  # Memory region, a value between 0 and 7
    address: int  # Start address, as an offset from the region's base address
    length: int  # Length of the range, in bytes

    def __str__(self):
        # Human-readable form with the address in hex, e.g. "(region=1, address=0x40, length=16)"
        return "(region={}, address={}, length={})".format(self.region, hex(self.address), self.length)
180
181
class NpuTileBox(NamedTuple):
    """
    Specifies the addresses and dimensions of the tiles of a feature map.
    A feature map can use 1 to 4 tiles
    """

    height_0: int  # The height of tile 0
    height_1: int  # The height of tile 1, 0 if unused
    width_0: int  # The width of tile 0, and tile 2 (if used)
    addresses: List[int]  # A list of 4 addresses, set unused addresses to 0
192
193
class NpuShape3D(NamedTuple):
    """
    Shape of (part of) a feature map.
    Also used to express block configs (see NpuBlockOperation.block_config
    and npu_find_block_configs)
    """

    height: int
    width: int
    depth: int
202
203
class NpuQuantization(NamedTuple):
    """
    Quantization parameters
    """

    scale_f32: Optional[float]  # Quantization scale, as a 32-bit float; may be None
    zero_point: int  # Quantization zero point
211
212
class NpuPadding(NamedTuple):
    """
    Padding to be applied to a convolution operation
    (see NpuBlockOperation.padding)
    """

    top: int  # Padding at the top edge
    left: int  # Padding at the left edge
    bottom: int  # Padding at the bottom edge
    right: int  # Padding at the right edge
222
223
class NpuActivation:
    """
    Activation function, fused with NPU operations
    (see NpuBlockOperation.activation)
    """

    def __init__(self, op_type: NpuActivationOp):
        self.op_type = op_type  # The activation operation to be performed
        # min/max are optional clamp bounds
        self.min: Optional[float] = None  # E.g. set to 0.0 for RELU
        self.max: Optional[float] = None  # E.g. set to 6.0 for RELU6
        # Table lookup index, only applicable for TABLE_LOOKUP activation, 0-7
        self.lookup_table_index: int = 0
236
237
class NpuFeatureMap:
    """
    Basic information about IFM, IFM2, OFM
    """

    def __init__(self):
        # Data type of the feature map elements
        self.data_type: NpuDataType = NpuDataType.UINT8
        # The memory region, a value 0-7
        self.region: int = 0
        # Shape of the feature map
        self.shape: NpuShape3D = NpuShape3D(height=0, width=0, depth=0)
        # The tiles that comprise the feature map. In the normal case when only 1 tile is used,
        # height_0 == self.shape.height, height_1 is 0, width_0 == self.shape.width, addresses[1:] are set to 0
        self.tiles: NpuTileBox = NpuTileBox(height_0=0, height_1=0, width_0=0, addresses=[0, 0, 0, 0])
        # Quantization of the feature map.
        # NOTE(review): annotation only — no value is assigned, so reading this attribute
        # before the caller sets it raises AttributeError; confirm whether `= None` was intended.
        self.quantization: Optional[NpuQuantization]
        # Memory layout of the feature map
        self.layout: NpuLayout = NpuLayout.NHWC
        # x/y/c strides used by the NPU when traversing the feature map, if None, vela will use default strides
        self.strides: Optional[NpuShape3D] = None
        # Used for debug
        self.name: Optional[str] = None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100258
259
class NpuKernel:
    """
    Kernel information for NPU operations
    """

    def __init__(self, w: int, h: int, stride_x: int = 1, stride_y: int = 1, dilation_x: int = 1, dilation_y: int = 1):
        """
        :param w: kernel width
        :param h: kernel height
        :param stride_x: horizontal stride, must be positive
        :param stride_y: vertical stride, must be positive
        :param dilation_x: horizontal dilation, must be positive
        :param dilation_y: vertical dilation, must be positive
        """
        # Non-positive strides/dilations are programming errors; reject up front
        assert stride_x > 0 and stride_y > 0
        assert dilation_x > 0 and dilation_y > 0
        self.width, self.height = w, h
        self.stride_x, self.stride_y = stride_x, stride_y
        self.dilation_x, self.dilation_y = dilation_x, dilation_y
274
275
class NpuAccumulatorType(Enum):
    """
    Accumulator dtype of NPU operation
    (see NpuBlockOperation.accumulator_type)
    """

    Default = auto()  # Use the default accumulator type for the operation
    Int32 = auto()  # 32-bit integer accumulator
    Int40 = auto()  # 40-bit integer accumulator
285
class NpuOperationType(Enum):
    """
    Type of NPU operation; discriminator stored in NpuOperation.op_type
    """

    Dma = auto()  # See NpuDmaOperation
    Conv2D = auto()  # See NpuConv2DOperation
    ConvDepthWise = auto()  # See NpuConvDepthWiseOperation
    Pooling = auto()  # See NpuPoolingOperation
    ElementWise = auto()  # See NpuElementWiseOperation
297
class NpuOperation:
    """
    Base class for all NPU operations
    """

    def __init__(self, op_type: NpuOperationType):
        # Discriminator for the concrete operation type; set by subclass constructors
        self.op_type = op_type
        # Used for debug
        self.name: Optional[str] = None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100307
308
class NpuDmaOperation(NpuOperation):
    """
    DMA operation, transferring data from a source to a destination address range
    """

    def __init__(self, src: NpuAddressRange, dest: NpuAddressRange):
        super().__init__(NpuOperationType.Dma)
        # Source address range of the transfer
        self.src = src
        # Destination address range of the transfer
        self.dest = dest
        # DMA channel, usually 0 (user channel)
        self.channel: int = 0
        # Channel mode, 0 = external, 1 = internal (should usually be 0)
        self.mode: int = 0
322
323
class NpuBlockOperation(NpuOperation):
    """
    Base class for operations which produce an OFM
    """

    def __init__(self, op_type: NpuOperationType):
        super().__init__(op_type)
        # Input feature map
        self.ifm: Optional[NpuFeatureMap] = None
        # Second input feature map, for binary elementwise operations
        self.ifm2: Optional[NpuFeatureMap] = None
        # The non-quantized scalar value in a binary elementwise operation. Only set if IFM2 is scalar
        self.ifm2_scalar: Optional[float] = None
        # Output feature map
        self.ofm: Optional[NpuFeatureMap] = None
        # Kernel parameters (size, stride, dilation)
        self.kernel: Optional[NpuKernel] = None
        # Weights, one element for each NPU core, empty if no weights are used.
        # Must have been compressed using npu_encode_weights()
        self.weights: List[NpuAddressRange] = []
        # Biases, one element for each NPU core, empty if no bias is used.
        # Must have been encoded using npu_encode_bias()
        self.biases: List[NpuAddressRange] = []
        # Padding to apply; may be None
        self.padding: Optional[NpuPadding] = None
        # Optional activation function to be applied
        self.activation: Optional[NpuActivation] = None
        # The block config to be used, which must be valid for the given operation.
        # See also npu_find_block_configs.
        # If the operation has weights, the depth of the block config must be the same as
        # the ofm depth used in the call to npu_encode_weights()
        # NOTE(review): annotation only — no default is assigned, so reading block_config
        # before the caller sets it raises AttributeError.
        self.block_config: NpuShape3D
        # Rounding mode used when scaling the OFM; TFL rounding by default
        self.rounding_mode: NpuRoundingMode = NpuRoundingMode.TFL
        # Set to True if the operation is fused with a Quantize operation (affects scaling)
        self.fused_quantize: bool = False
        # IFM upscaling to be applied
        self.ifm_upscale: NpuResamplingMode = NpuResamplingMode.NONE
        # Accumulator dtype to use (see NpuAccumulatorType)
        self.accumulator_type: NpuAccumulatorType = NpuAccumulatorType.Default
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100357
358
class NpuConv2DOperation(NpuBlockOperation):
    """
    NPU_OP_CONV operation
    """

    def __init__(self):
        super().__init__(NpuOperationType.Conv2D)
        # Block traversal must be consistent with the block_traversal parameter specified in
        # weight_compressor.encode_weights(), i.e. the value passed to npu_encode_weights()
        self.block_traversal: NpuBlockTraversal = NpuBlockTraversal.PART_KERNEL_FIRST
369
370
class NpuConvDepthWiseOperation(NpuBlockOperation):
    """
    NPU_OP_DEPTHWISE operation.
    All parameters (kernel, weights, biases, ...) are provided via the
    NpuBlockOperation attributes
    """

    def __init__(self):
        super().__init__(NpuOperationType.ConvDepthWise)
378
379
class NpuPoolingOperation(NpuBlockOperation):
    """
    NPU_OP_POOL operation
    """

    def __init__(self, pooling_op_type: NpuPoolingOp):
        super().__init__(NpuOperationType.Pooling)
        # The specific pooling operation to perform
        self.sub_op_type: NpuPoolingOp = pooling_op_type
        # Set to a float value for ResizeBilinear/NearestNeighbor operations (affects scaling), else to None
        self.rescale: Optional[float] = None
390
391
class NpuElementWiseOperation(NpuBlockOperation):
    """
    NPU_OP_ELEMENTWISE operation
    """

    def __init__(self, elementwise_op_type: NpuElementWiseOp):
        super().__init__(NpuOperationType.ElementWise)
        # The specific elementwise operation to perform
        self.sub_op_type: NpuElementWiseOp = elementwise_op_type
        # Set to True for binary operators where IFM2 should be used as first operand
        self.reversed_operands: bool = False
        # Set to a tuple (scale, shift) for explicit rescale, else to None
        self.rescale: Optional[Tuple] = None
Patrik Gustavssonc8a22f12020-11-18 17:05:50 +0100404
405
def npu_get_api_version():
    """
    Public facing API to get the API version.

    :return: int; the 16 most significant bits hold the major version,
        the 16 least significant bits hold the minor version
    """
    # Pack major/minor into one word: major in the high half-word,
    # minor (masked to 16 bits) in the low half-word
    return (API_VERSION_MAJOR << 16) | (API_VERSION_MINOR & 0xFFFF)
Louis Verhaardaeae5672020-11-02 18:04:27 +0100414
415
def npu_encode_weights(
    accelerator: NpuAccelerator,
    weights_volume: numpy.ndarray,
    dilation_xy: Tuple[int, int],
    ifm_bitdepth: int,
    ofm_block_depth: int,
    is_depthwise: bool,
    block_traversal: NpuBlockTraversal,
):
    """
    Public facing API to use the Ethos-U weight encoding.

    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :param weights_volume: numpy.ndarray in OHWI layout with a shape of four
    :param dilation_xy: a two element tuple of dilation attributes in x,y dimension
    :param ifm_bitdepth: the bitdepth of input feature map
    :param ofm_block_depth: the depth of blocks for processing
    :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
    :param block_traversal: indicates how these weights are traversed on sub-kernel basis
    :return: a bytearray of encoded weights
    """
    # Imports deferred to call time — presumably to keep importing this API module
    # cheap and to avoid import cycles; pattern shared by all public APIs in this file
    from .architecture_features import Accelerator
    from . import weight_compressor

    # Translate the public accelerator enum to the internal representation
    acc = Accelerator.from_npu_accelerator(accelerator)
    # encode_weights returns a pair; only the encoded stream is part of the public
    # API, the second element is intentionally discarded
    encoded_weights, _ = weight_compressor.encode_weights(
        acc, weights_volume, dilation_xy, ifm_bitdepth, ofm_block_depth, is_depthwise, block_traversal
    )
    return encoded_weights
Louis Verhaardaeae5672020-11-02 18:04:27 +0100445
446
def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
    """
    Public facing API to pack bias and scale values as required by the hardware

    :param bias: 64-bit signed number that includes 40-bit signed bias
    :param scale: 32-bit scale value
    :param shift: 6-bit shift value
    :return: packed 80-bit [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
    """
    # Import deferred to call time; see npu_encode_weights for rationale
    from . import weight_compressor

    # Thin wrapper; the actual bit-packing is implemented in the compiler internals
    return weight_compressor.encode_bias(bias, scale, shift)
458
459
def npu_find_block_configs(npu_op: NpuOperation, accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Public facing API that returns a list of block configs that are valid for the given operation.
    This function can be used to find a valid value for npu_op.block_config.
    The block config is the unit of work in which the NPU generates the OFM.

    :param npu_op: one of the NpuBlockOperation subclasses, with ifm/ofm (and
        ifm2/kernel/activation where applicable) already filled in
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :raises AssertionError: if npu_op is not a supported operation type, or if
        no valid block config exists
    """
    # Imports deferred to call time; see npu_encode_weights for rationale
    from .architecture_features import Accelerator
    from .architecture_features import ArchitectureFeatures
    from .architecture_features import Block
    from .architecture_features import create_default_arch
    from .architecture_allocator import try_block_config
    from .register_command_stream_generator import resampling_mode_map
    from .register_command_stream_util import to_kernel
    from .operation import NpuBlockType

    # Map the public operation class to the internal block type
    is_partkernel = False
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
        is_partkernel = npu_op.block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    # Convert the public NpuShape3D shapes to internal Block objects (note: width, height, depth order)
    ifm_shape = Block(npu_op.ifm.shape.width, npu_op.ifm.shape.height, npu_op.ifm.shape.depth)
    ifm2_shape = None
    if npu_op.ifm2:
        ifm2_shape = Block(npu_op.ifm2.shape.width, npu_op.ifm2.shape.height, npu_op.ifm2.shape.depth)
    ofm_shape = Block(npu_op.ofm.shape.width, npu_op.ofm.shape.height, npu_op.ofm.shape.depth)

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    kernel = to_kernel(npu_op.kernel)
    # A table-lookup activation occupies 2 LUT banks
    lut_banks = 0
    if npu_op.activation:
        lut_banks = 2 if npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP else 0

    # Scaling is only possible when every present feature map carries quantization info
    has_scaling = True
    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
        if tensor and tensor.quantization is None:
            has_scaling = False
            break

    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))

    # The search space is bounded by the OFM shape and the architecture's maximum block size
    max_block_width = min(arch.ofm_block_max.width, ofm_shape.width)
    max_block_height = min(arch.ofm_block_max.height, ofm_shape.height)
    max_block_depth = min(arch.ofm_block_max.depth, ofm_shape.depth)

    # Block W/H must be at least one microblock, and at least 2 when the IFM is upscaled.
    # NOTE(review): ifm_resampling_mode is the value mapped through resampling_mode_map,
    # yet it is compared against NpuResamplingMode.NONE — if the map translates to a
    # different enum type, this comparison is always True; confirm the map's value type.
    min_block_height = max(arch.ofm_ublock.height, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)
    min_block_width = max(arch.ofm_ublock.width, 2 if ifm_resampling_mode != NpuResamplingMode.NONE else 1)

    # Exhaustively try every candidate block (in microblock steps) and keep the valid ones
    valid_block_configs = []
    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
                    block = Block(w, h, c)
                    config = try_block_config(
                        block,
                        arch,
                        block_type,
                        ofm_shape,
                        ifm_shape,
                        ifm2_shape,
                        npu_op.ifm2_scalar is not None,
                        ifm_bits,
                        is_partkernel,
                        kernel,
                        lut_banks,
                        has_scaling,
                        ifm_resampling_mode,
                    )

                    if config:
                        ofm_block = config.ofm_block
                        # Convert back to the public NpuShape3D (height, width, depth order)
                        valid_block_configs.append(NpuShape3D(ofm_block.height, ofm_block.width, ofm_block.depth))

    assert len(valid_block_configs) > 0
    return valid_block_configs
Louis Verhaard933f55e2020-11-25 14:10:30 +0100547
548
def npu_generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: NpuAccelerator) -> List[int]:
    """
    Public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: register commands, as a list of 32-bit integers
    """
    # Import deferred to call time; see npu_encode_weights for rationale
    from . import register_command_stream_generator

    return register_command_stream_generator.generate_register_command_stream(npu_op_list, accelerator)
Louis Verhaard52078302020-11-18 13:35:06 +0100561
562
def npu_create_driver_payload(register_command_stream: List[int], accelerator: NpuAccelerator) -> bytes:
    """
    Public facing API for generating driver payload, containing a driver header
    and the given Ethos-U register command stream.
    Returns the payload, in little endian format, which must be placed in memory on a 16-byte aligned
    address.

    :param register_command_stream: List[int] register commands, as a list of 32-bit integers
        (e.g. as produced by npu_generate_register_command_stream)
    :param accelerator: NpuAccelerator enum to pick the correct accelerator
    :return: driver payload, as a byte array
    """
    # Import deferred to call time; see npu_encode_weights for rationale
    from . import driver_actions

    return driver_actions.npu_create_driver_payload(register_command_stream, accelerator)