# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Conversion from high level command to NpuOperation
from enum import IntEnum
from typing import cast
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import ArchitectureFeatures
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .high_level_command_stream import Box
from .high_level_command_stream import Command
from .high_level_command_stream import DMA
from .high_level_command_stream import NOP
from .high_level_command_stream import NpuStripe
from .numeric_util import quantise_float32
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation import RoundingMode
from .register_command_stream_generator import generate_command_stream
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import to_npu_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .shape4d import Shape4D
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .weight_compressor import NpuWeightTensor
from .weight_compressor import WeightKey


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


dtype_map = {
    DataType.uint8: NpuDataType.UINT8,
    DataType.int8: NpuDataType.INT8,
    DataType.uint16: NpuDataType.UINT16,
    DataType.int16: NpuDataType.INT16,
    DataType.int32: NpuDataType.INT32,
}


# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
elementwise_op_map = {
    Op.Mul: NpuElementWiseOp.MUL,
    Op.Add: NpuElementWiseOp.ADD,
    Op.Sub: NpuElementWiseOp.SUB,
    Op.Minimum: NpuElementWiseOp.MIN,
    Op.Maximum: NpuElementWiseOp.MAX,
    Op.LeakyRelu: NpuElementWiseOp.LRELU,
    Op.Abs: NpuElementWiseOp.ABS,
    Op.CLZ: NpuElementWiseOp.CLZ,
    Op.SHR: NpuElementWiseOp.SHR,
    Op.SHL: NpuElementWiseOp.SHL,
}


# inverse of the resampling_mode_map in the register command stream generator
resampling_mode_inv_map = {
    resampling_mode.NONE: NpuResamplingMode.NONE,
    resampling_mode.NEAREST: NpuResamplingMode.NEAREST,
    resampling_mode.TRANSPOSE: NpuResamplingMode.TRANSPOSE,
}


rounding_mode_map = {
    RoundingMode.TFLite: NpuRoundingMode.TFL,
    RoundingMode.ToZero: NpuRoundingMode.TRUNCATE,
    RoundingMode.HalfUp: NpuRoundingMode.NATURAL,
    RoundingMode.AwayZero: NpuRoundingMode.NATURAL,
}
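# Note: RoundingMode.AwayZero also maps to NpuRoundingMode.NATURAL, since the external API only
# exposes TFL, TRUNCATE and NATURAL; the away-from-zero cases are additionally handled by forcing
# the zero point to 0 for the affected converted operators (see use_zero_point_0 below).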


def ifm_ifm2_correct_order(ifm_shape: Shape4D, ifm2_shape: Shape4D) -> bool:

    if ifm_shape is None:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape is None:
        return True

    for ifm, ifm2 in zip(ifm_shape.as_list(), ifm2_shape.as_list()):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False
    return True
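# Illustrative example (hypothetical shapes): with an IFM of shape 1x1x1x8 and an IFM2 of shape
# 1x4x4x8 the broadcast dimension sits in the first operand, so this returns False and
# create_npu_elementwise_op swaps the feature maps (and sets reversed_operands) before the
# stripe is converted.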


def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used"""
    rounding_mode = NpuRoundingMode.TFL
    if op.type.is_resize_op():
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        op.original_type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op.type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    if op.rounding_mode is not None:
        rounding_mode = rounding_mode_map[op.rounding_mode]
    return rounding_mode


def create_padding(cmd: NpuStripe, primary_op: Operation, npu_op: NpuBlockOperation) -> NpuPadding:
    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        return NpuPadding(top=0, left=0, bottom=0, right=0)
    top, left, bottom, right = primary_op.attrs["explicit_padding"]

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        top = cmd.pad_top
        bottom = cmd.pad_bottom

    # The ifm box coordinate range depends upon whether the primary op was combined with a split slice read
    ifm_read_offset = primary_op.read_offsets[0]
    ifm_read_shape = primary_op.read_shapes[0]
    if ifm_read_offset is None or len(ifm_read_offset) < 2:
        box_start_coord_min = 0
        box_end_coord_max = cmd.ps.ifm_shapes[0].width
    else:
        box_start_coord_min = ifm_read_offset[-2]
        box_end_coord_max = ifm_read_shape[-2]

    # Indexing from the end, since a 1x1 AvgPool might have been added with non 4-dimensional input/output
    # because an activation function needed to be fused.
    if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > box_start_coord_min:
        left = 0
    if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < box_end_coord_max:
        right = 0

    # If tile padding is selected, modify the tile base addresses and set NpuPadding to zero.
    if primary_op.attrs.get("padding", None) == Padding.TILE:
        assert cmd.ifm_tensor.format == TensorFormat.NHCWB16, "Tensor format NHCWB16 required to perform tile padding"
        assert npu_op.op_type == NpuOperationType.ConvDepthWise, "Tile padding only supported for depthwise convolution"
        assert npu_op.ifm is not None, "Feature map must be initialized to modify the tile addresses"
        npu_op.ifm.tiles = modify_tile_addresses_for_padding(
            npu_op.ifm.tiles,
            primary_op.attrs.get("explicit_padding", None),
            channels=cmd.ps.ifm_shapes[0].depth,
            dtype=cmd.ifm_tensor.dtype,
        )
        top, left, bottom, right = 0, 0, 0, 0

    return NpuPadding(top=top, left=left, bottom=bottom, right=right)
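# Note on the horizontal padding above: a stripe whose IFM box does not start or end at the
# (possibly split-slice adjusted) edge of the feature map gets its left/right padding cleared,
# since that data is read from the neighbouring part of the tensor rather than supplied as padding.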


def modify_tile_addresses_for_padding(
    tile_box: NpuTileBox, padding_direction: List[int], channels: int, dtype: DataType
) -> NpuTileBox:
    # Addresses are 16-byte aligned when using the NHCWB16 format, which is required to utilize tiling
    # Calculate the offset to top right, bottom left and bottom right element in the IFM (top left offset is 0)
    """
    Example: 4x4x1 IFM
    | a b c d |  <-- Offset to TR ('d') is (w0-1) = 3
    | e f g h |
    | i j k l |
    | m n o p |  <-- Offset to BL ('m') is (w0*(h0-1)) = 12 and to BR ('p') ((w0*h0)-1) = 15
    """
    h0, h1, w0, addresses = tile_box
    elem_size = 2 if dtype == DataType.int16 else 1
    tr_offset = (w0 - 1) * 16 * elem_size
    bl_offset = w0 * (h0 - 1) * 16 * (round_up(channels, 16) // 16) * elem_size
    br_offset = tr_offset + bl_offset

    # Explicit padding order: (Top, Left, Bottom, Right)
    if padding_direction == (1, 1, 0, 0):
        # Pad top left corner
        """
                   | a a b |
        | a b | -> | a a b |
        | c d |    | c c d |
        """
        addresses = [addresses[0]] * 4
        h0, h1, w0 = 1, 1, 1

    elif padding_direction == (1, 0, 0, 1):
        # Pad top right corner
        """
                   | a b b |
        | a b | -> | a b b |
        | c d |    | c d d |
        """
        addresses = [addresses[0], addresses[0] + tr_offset, addresses[0], addresses[0] + tr_offset]
        h0, h1, w0 = 1, 1, w0

    elif padding_direction == (0, 1, 1, 0):
        # Pad bottom left corner
        """
        | a b |    | a a b |
        | c d | -> | c c d |
                   | c c d |
        """
        addresses = [addresses[0], addresses[0], addresses[0] + bl_offset, addresses[0] + bl_offset]
        h0, h1, w0 = h0, h1, 1

    elif padding_direction == (0, 0, 1, 1):
        # Pad bottom right corner
        """
        | a b |    | a b b |
        | c d | -> | c d d |
                   | c d d |
        """
        addresses = [
            addresses[0],
            addresses[0] + tr_offset,
            addresses[0] + bl_offset,
            addresses[0] + br_offset,
        ]
        # h0, h1, w0 = h0, h1, w0
    else:
        assert 0, "Invalid padding direction for tile padding"

    return NpuTileBox(height_0=h0, height_1=h1, width_0=w0, addresses=[int(addr) for addr in addresses])
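# Worked example of the offsets above, assuming a 4x4x1 int8 IFM (illustration only): elem_size
# is 1 and there is a single 16-byte channel brick, so tr_offset = (4 - 1) * 16 = 48,
# bl_offset = 4 * (4 - 1) * 16 = 192 and br_offset = 48 + 192 = 240 bytes from the tile 0 address.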


def get_region(mem_type: MemType, arch: ArchitectureFeatures) -> int:
    base_ptr_idx_map = {
        MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
        MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
        MemType.Scratch: BasePointerIndex.ScratchTensor,
    }

    if arch.is_spilling_enabled():
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
    else:
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor

    return base_ptr_idx_map[mem_type].value


def get_mem_limits_for_regions(arch: ArchitectureFeatures) -> Dict[int, int]:
    """Returns map region -> max size of the region in bytes"""
    mem_limits = dict()
    for mem_type in MemType.all():
        mem_limits[get_region(mem_type, arch)] = arch.mem_type_size(mem_type)
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return mem_limits
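# The returned mapping is keyed on base pointer index, e.g. roughly (the actual sizes depend
# entirely on the configured ArchitectureFeatures and on whether spilling is enabled):
#   {0: <permanent storage limit>, 1: <scratch limit>, 2: <fast scratch limit>,
#    BASE_PTR_INDEX_MEM2MEM: arch.shram_size_bytes}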


def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        block = ifm_box.get_block()
    else:
        block = ofm_box.get_block()
    return block.depth


def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
    """Checks if quantization should use 0 as zero point"""
    if tens.dtype == DataType.int32 and is_ifm_tensor:
        return True
    if ps.primary_op.rounding_mode == RoundingMode.AwayZero:
        if (
            ps.primary_op.original_type == Op.AvgPool
            and ps.primary_op.type == Op.Conv2DBias
            and ps.primary_op.attrs.get("padding", None) in (Padding.EXPLICIT, Padding.VALID)
        ):
            # Force zero point to 0 for AveragePool operators converted to a Conv2DBias with rounding away from
            # zero.
            return True
        if ps.primary_op.original_type == Op.ResizeBilinear and ps.primary_op.type == Op.DepthwiseConv2DBias:
            # Force zero point to 0 for ResizeBilinear operators converted to a DepthwiseConv with rounding away from
            # zero. This is because the reference kernel ignores the zero points.
            return True
        if (
            not is_ifm_tensor
            and ps.primary_op.original_type == Op.AvgPool
            and ps.primary_op.attrs.get("padding", None) == Padding.EXPLICIT
            and ps.primary_op.type == Op.DepthwiseConv2DBias
        ):
            # Force zero point to 0 for the OFM of AvgPool operators that have been combined with a previous PAD
            # operator and converted to a DepthwiseConv with rounding away from zero. This is because the zero point
            # will already have been applied in the Bias.
            return True
    if ps.primary_op.type not in (Op.AvgPool, Op.CLZ, Op.SHL) and not ps.primary_op.type.is_resize_op():
        return False
    if ps.primary_op.type == Op.AvgPool and ps.primary_op.explicit_scaling:
        return False
    fused_quantize = any(op.type == Op.Quantize or op.original_type == Op.Quantize for op in ps.ops)
    forced_ofm_quantization = ps.primary_op.forced_output_quantization
    use_0 = (
        (
            ps.primary_op.activation is None
            or forced_ofm_quantization is not None
            or (ps.primary_op.type.is_avgpool_op() and ps.primary_op.activation.op_type.is_relu_op())
        )
        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
        and not fused_quantize
    )
    return use_0


def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for IFM/IFM2"""
    op = ps.primary_op
    ifm_quant = op.forced_input_quantization if op.forced_input_quantization is not None else tens.quantization
    if ifm_quant is None:
        return None
    if use_zero_point_0(ps, tens, True):
        zero_point = 0
    else:
        zero_point = int(ifm_quant.zero_point)
    return NpuQuantization(scale_f32=ifm_quant.scale_f32, zero_point=zero_point)


def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for OFM"""
    op = ps.primary_op
    # Check if the operation's output quantization should be used instead of the output tensor's quantization
    # (used in LUTs)
    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
    if ofm_quant is None:
        return None
    if use_zero_point_0(ps, tens, False):
        zero_point = 0
    else:
        zero_point = int(ofm_quant.zero_point)
    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)


def create_feature_map(
    tens: Tensor,
    box: Box,
    arch: ArchitectureFeatures,
    op_shape4D: Shape4D,
    tile_base_offsets: List[int],
    stride_multiplier: Optional[List[int]] = None,
) -> NpuFeatureMap:
    """Creates feature map with common fields populated"""
    fm = NpuFeatureMap()
    fm.region = get_region(tens.mem_type, arch)
    fm.data_type = dtype_map[tens.dtype]
    if tens.format == TensorFormat.NHWC:
        fm.layout = NpuLayout.NHWC
    elif tens.format == TensorFormat.NHCWB16:
        fm.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"

    strides = tens.get_strides(op_shape4D)
    assert strides is not None

    if stride_multiplier and stride_multiplier != [1, 1, 1]:
        assert (
            tens.format == TensorFormat.NHWC
        ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
        # Multiply strides for C/H/W (in that order) with corresponding stride factor
        for i, stride_factor in enumerate(stride_multiplier, start=1):
            strides[i] *= stride_factor

    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
        box.start_coord, box.end_coord, strides, op_shape4D
    )

    for idx, offset in enumerate(tile_base_offsets):
        addresses[idx] += offset
    fm.tiles = NpuTileBox(
        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
    )
    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
    fm.name = tens.name
    return fm


def create_weights(
    weight_tensor: NpuWeightTensor, weight_box: Box, scale_tensor: NpuWeightTensor, arch: ArchitectureFeatures
) -> Tuple[List[NpuAddressRange], List[NpuAddressRange]]:
    """Returns address ranges for weights and scales"""
    weights = []
    biases = []
    shared_region = get_region(weight_tensor.mem_type, arch)
    scale_region = get_region(scale_tensor.mem_type, arch) if scale_tensor else 0

    w_tensor_src = weight_tensor
    if weight_tensor.src_tensor:
        w_tensor_src = cast(NpuWeightTensor, weight_tensor.src_tensor)

    core_offset = 0
    for core in range(0, arch.ncores):
        # Get weight range per core
        key = WeightKey(core, weight_box.start_coord[-1])
        if key in w_tensor_src.encoded_ranges:
            weight_range = w_tensor_src.encoded_ranges[key]
            if weight_tensor == w_tensor_src:
                # Straight from source tensor
                address = weight_tensor.address + weight_range.offset
            else:
                # Weight buffered tensor
                address = weight_tensor.address + core_offset
                core_offset += round_up(weight_range.total_bytes, 16)

            # Location of weights in tensor
            addr_range = NpuAddressRange(
                shared_region, int(address + weight_range.weight_offset), round_up(int(weight_range.weight_bytes), 16)
            )
            weights.append(addr_range)

            # Location of standalone scales or combined weights tensor scales
            if scale_tensor:
                assert scale_tensor.src_tensor is None  # Must be standalone
                scale_range = scale_tensor.encoded_ranges[key]
                address = scale_tensor.address + scale_range.offset
                addr_range = NpuAddressRange(scale_region, int(address), round_up(int(scale_range.scale_bytes), 16))
            else:
                addr_range = NpuAddressRange(shared_region, int(address), round_up(int(weight_range.scale_bytes), 16))

            biases.append(addr_range)

    return weights, biases
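# With a multi-core accelerator (arch.ncores > 1) the encoded weight stream holds one range per
# core, so one NpuAddressRange per core is emitted for both weights and scales; on a single-core
# configuration the loop above runs exactly once.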


def create_npu_activation(op: Operation) -> NpuActivation:
    """Creates fused activation function"""
    if op.activation is None:
        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
    faf = op.activation.op_type
    act_op = NpuActivationOp.NONE_OR_RELU
    if faf == Op.Tanh:
        act_op = NpuActivationOp.TANH
    elif faf == Op.Sigmoid:
        act_op = NpuActivationOp.SIGMOID
    elif faf == Op.LUT:
        act_op = NpuActivationOp.TABLE_LOOKUP
    elif not faf.is_relu_op():
        raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")

    act = NpuActivation(act_op)
    act.min = op.activation.min
    act.max = op.activation.max
    if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.explicit_scaling:
        quant = op.ofm.quantization
        if quant and quant.zero_point:  # Zero point is not 0
            scale_f32 = 1 if quant.scale_f32 is None else quant.scale_f32
            zero_point = quant.zero_point
            if act.min is not None:
                act.min = scale_f32 * quantise_float32(act.min, scale_f32, zero_point)
            if act.max is not None:
                act.max = scale_f32 * quantise_float32(act.max, scale_f32, zero_point)
    act.lookup_table_index = op.activation.lut_index
    return act


def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
    """Sets common fields of the given operation"""
    ps = cmd.ps
    op = ps.primary_op

    ifm_height = cmd.ifm_box.get_block().height
    ifm_width = cmd.ifm_box.get_block().width
    ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)

    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0])
    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)

    out_block = cmd.ofm_box.get_block()
    npu_op.ofm = create_feature_map(
        cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.tile_base_offsets_ofm, op.ofm_stride_multiplier
    )
    npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)

    if cmd.weight_tensor is not None:
        npu_op.weights, npu_op.biases = create_weights(cmd.weight_tensor, cmd.weight_box, cmd.scale_tensor, arch)
    npu_op.activation = create_npu_activation(op)
    npu_op.fused_quantize = any(op.type == Op.Quantize or op.original_type == Op.Quantize for op in ps.ops)
    npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])

    if not op.type.is_elementwise_op():
        npu_op.padding = create_padding(cmd, op, npu_op)
        npu_op.kernel = to_npu_kernel(op.kernel)
    npu_op.ifm_upscale = resampling_mode_inv_map[op.ifm_resampling_mode]
    return npu_op


def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
    """Converts the command to NpuConv2DOperation"""
    npu_op = NpuConv2DOperation()
    set_common_op_fields(npu_op, cmd, arch)
    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    else:
        if cmd.weight_tensor.src_tensor:
            npu_op.block_traversal = cmd.weight_tensor.src_tensor.hw_traversal
        else:
            npu_op.block_traversal = cmd.weight_tensor.hw_traversal
    return npu_op


def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
    """Converts the command to NpuConvDepthWiseOperation"""
    npu_op = NpuConvDepthWiseOperation()
    set_common_op_fields(npu_op, cmd, arch)
    return npu_op


def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
    """Converts the command to NpuPoolingOperation"""
    ps = cmd.ps
    op = ps.primary_op
    if op.type.is_maxpool_op():
        pool_op = NpuPoolingOp.MAX
    elif op.type.is_avgpool_op() or op.type.is_resize_op():
        pool_op = NpuPoolingOp.AVERAGE
    elif op.type == Op.ReduceSum:
        pool_op = NpuPoolingOp.REDUCE_SUM
    else:
        assert 0, f"Unknown pool type {op.type}"
    npu_op = NpuPoolingOperation(pool_op)
    set_common_op_fields(npu_op, cmd, arch)
    # Pooling specific info
    if op.explicit_scaling:
        # Note: the rescale field is reused for explicit scaling to avoid exposing this in the external API
        npu_op.rescale = op.explicit_scaling
    return npu_op


def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
    """Converts the command to NpuElementWiseOperation"""
    ps = cmd.ps
    op = ps.primary_op
    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
    elemwise_op = elementwise_op_map[op.type]
    npu_op = NpuElementWiseOperation(elemwise_op)

    if elemwise_op not in UNARY_ELEMWISE_OPS:
        ifm_shape = None if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0]
        ifm2_shape = None if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1]
        if cmd.reversed_operands:
            assert ifm_ifm2_correct_order(ifm_shape, ifm2_shape)
            npu_op.reversed_operands = True
        elif not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
            # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
            ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
            npu_op.reversed_operands = True
        npu_op.ifm2 = create_feature_map(
            cmd.ifm2_tensor,
            cmd.ifm2_box,
            arch,
            ps.ifm_shapes[1],
            op.tile_base_offsets_ifm[1],
        )
        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
        if cmd.ifm2_tensor.shape == []:
            # scalar
            npu_op.ifm2_scalar = cmd.ifm2_tensor.get_scalar()
            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
        else:
            ifm2_blk = cmd.ifm2_box.get_block()
            npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_blk.width, depth=ifm2_blk.depth)
    set_common_op_fields(npu_op, cmd, arch)
    # Check if the output scale needs to be overridden
    output_scale = None
    if op.explicit_scaling is not None:
        assert not op.explicit_scaling.per_channel
        assert op.type in (Op.Add, Op.Mul, Op.Sub)
        npu_op.rescale = (op.explicit_scaling.multiplier[0], op.explicit_scaling.shift[0])
    elif op.type == Op.Add and op.original_type.is_resize_op():
        # Force the output scale to be the same as the input scale for
        # resizebilinear/nearestneighbor 1x1 that is converted to add
        output_scale = npu_op.ifm2.quantization.scale_f32
    elif op.type == Op.Abs:
        output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
    elif op.type == Op.LeakyRelu:
        output_scale = op.attrs["alpha"]
    elif op.type in (Op.Add, Op.Mul, Op.Sub):
        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
            output_scale = 1 / 0x3000
    if output_scale is not None:
        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
    return npu_op


def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
    """Converts the command to NpuDmaOperation"""
    src_region = get_region(cmd.in_tensor.mem_type, arch)
    if cmd.out_tensor.purpose == TensorPurpose.LUT:
        dest_region = BASE_PTR_INDEX_MEM2MEM
    else:
        dest_region = get_region(cmd.out_tensor.mem_type, arch)

    if cmd.in_tensor.purpose == TensorPurpose.Weights:
        # Get weight range per core
        sz = 0
        for core in range(0, arch.ncores):
            key = WeightKey(core, cmd.box.start_coord[-1])
            if key in cmd.in_tensor.encoded_ranges:
                weight_range = cmd.in_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

                if core == 0:
                    weight_range = cmd.in_tensor.encoded_ranges[key]
                    src_addr = cmd.in_tensor.address + weight_range.offset
                    dest_addr = cmd.out_tensor.address
    else:
        src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
        dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
        # DMA must use 16-byte alignment (tensors are always aligned but the sz calculation uses actual size)
        sz = round_up(cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr, 16)
    src = NpuAddressRange(src_region, int(src_addr), int(sz))
    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
    return NpuDmaOperation(src, dest)
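# Note: for LUT transfers the destination region is the internal MEM2MEM base pointer rather than
# a memory-type region, and for weight tensors the transfer length is the sum of the per-core
# encoded ranges, each rounded up to 16 bytes.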


def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
    """Converts the high level command to NpuOperation"""
    npu_op: NpuOperation
    if isinstance(cmd, DMA):
        npu_op = create_dma_op(cmd, arch)
        npu_op.name = cmd.out_tensor.name
    elif isinstance(cmd, NpuStripe):
        npu_block_type = cmd.ps.primary_op.type.npu_block_type
        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
            npu_op = create_npu_conv2d_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            npu_op = create_npu_conv_depthwise_op(cmd, arch)
        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            npu_op = create_npu_pool_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ElementWise:
            npu_op = create_npu_elementwise_op(cmd, arch)
        else:
            assert 0, f"Unknown command type {npu_block_type}"
        npu_op.name = cmd.ps.primary_op.name
    return npu_op


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        elif isinstance(cmd, NOP):
            # NOP should not generate anything
            continue
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    mem_limits = get_mem_limits_for_regions(arch)
    # Generate register commands
    if len(sg.high_level_command_stream) > 0:
        stream_id = DebugDatabase.add_stream(sg)
        sg.generated_stream_id = stream_id

        def add_to_debug_db(npu_op: NpuOperation, offset: int):
            """Adds info to the debug database"""
            if not isinstance(npu_op, NpuDmaOperation):
                cmd = npu_op_to_cmd[npu_op]
                DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

        sg.register_command_stream = generate_command_stream(
            npu_op_list, arch, verbose, mem_limits, add_to_debug_db, npu_op_to_cmd
        )