blob: 49b24b2b16113d906de19fb5e74e609ff28c1a15 [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains unit tests for generate_register_command_stream API for an external consumer
19from ethosu.vela.api import NpuActivation
20from ethosu.vela.api import NpuActivationOp
21from ethosu.vela.api import NpuAddressRange
22from ethosu.vela.api import NpuBlockTraversal
23from ethosu.vela.api import NpuConv2DOperation
24from ethosu.vela.api import NpuConvDepthWiseOperation
25from ethosu.vela.api import NpuDataType
26from ethosu.vela.api import NpuDmaOperation
27from ethosu.vela.api import NpuElementWiseOp
28from ethosu.vela.api import NpuElementWiseOperation
29from ethosu.vela.api import NpuFeatureMap
30from ethosu.vela.api import NpuKernel
31from ethosu.vela.api import NpuLayout
32from ethosu.vela.api import NpuPadding
33from ethosu.vela.api import NpuPoolingOp
34from ethosu.vela.api import NpuPoolingOperation
35from ethosu.vela.api import NpuQuantization
36from ethosu.vela.api import NpuShape3D
37from ethosu.vela.api import NpuTileBox
38from ethosu.vela.architecture_features import Accelerator
39from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd0
40from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd1
41from ethosu.vela.register_command_stream_generator import CmdMode
42from ethosu.vela.register_command_stream_generator import generate_register_command_stream
43from ethosu.vela.register_command_stream_generator import get_address_ranges
44
45
def check_cmd0(cmd_stream, cmd, param):
    """Asserts that the command stream contains the given cmd0 command + parameter.

    The parameter is truncated to 16 bits, shifted into bits 16..31 and OR-ed
    with ``cmd.value``; the resulting word must occur somewhere in cmd_stream.

    Raises:
        AssertionError: if the expected word is not in the command stream.
    """
    param = int(param) & 0xFFFF
    expected_word = (param << 16) | cmd.value
    is_present = expected_word in cmd_stream
    assert is_present, f"Not in command stream: {cmd} {param}"
51
52
def check_cmd1(cmd_stream, cmd, offset, param=0x0):
    """Checks that the command stream contains the given cmd1 command + payload.

    A match is a word equal to ``cmd.value | CmdMode.Payload32.value | (param << 16)``
    that is immediately followed by a word equal to the (masked) offset.

    Args:
        cmd_stream: sequence of command stream words.
        cmd: command to look for; only its ``value`` attribute is used.
        offset: expected payload word; converted with int() and masked to 32 bits.
        param: value placed in bits 16 and up of the command word (default 0).

    Raises:
        AssertionError: if no such pair of consecutive words exists.
    """
    # The payload is compared against exactly one stream word (Payload32), so keep
    # 32 bits; a wider mask can never match a negative offset's two's complement.
    offset = int(offset) & 0xFFFFFFFF
    command = cmd.value | CmdMode.Payload32.value | (param << 16)
    for word, payload in zip(cmd_stream, cmd_stream[1:]):
        if word == command and payload == offset:
            return  # found
    # Raise explicitly: "assert False" is removed when Python runs with -O
    raise AssertionError(f"Not in command stream: {cmd} {offset} {param}")
61
62
def find_cmd0(cmd_stream, cmd) -> int:
    """Returns parameter of the first command in the stream that matches the given command.

    A word matches when its low 16 bits equal ``cmd.value``; the returned
    parameter is bits 16..31 of that word.

    Raises:
        AssertionError: if no word in the stream matches.
    """
    # NOTE(review): every word is scanned, including the payload words that
    # follow cmd1 commands (see check_cmd1), so a payload whose low 16 bits
    # happen to equal cmd.value would also match - confirm this is acceptable.
    for command in cmd_stream:
        if (command & 0xFFFF) == cmd.value:
            return (command >> 16) & 0xFFFF
    # Raise explicitly: "assert False" is removed when Python runs with -O,
    # which would make this function silently return None.
    raise AssertionError(f"Not in command stream: {cmd}")
69
70
def create_feature_map(
    shape: NpuShape3D,
    region: int,
    address: int,
    dtype: NpuDataType = NpuDataType.UINT8,
    layout: NpuLayout = NpuLayout.NHWC,
    quant=NpuQuantization(scale_f32=1, zero_point=0),
) -> NpuFeatureMap:
    """Builds a feature map that is described by a single tile.

    The tile box spans the full width and height of ``shape`` and only its
    first address is set to ``address``; the other three addresses are 0.
    Region, layout, data type and quantization are copied from the arguments.
    """
    single_tile = NpuTileBox(
        height_0=shape.height,
        height_1=shape.height,
        width_0=shape.width,
        addresses=[address, 0, 0, 0],
    )
    feature_map = NpuFeatureMap()
    feature_map.shape = shape
    feature_map.data_type = dtype
    feature_map.layout = layout
    feature_map.region = region
    feature_map.quantization = quant
    feature_map.tiles = single_tile
    return feature_map
90
91
def test_conv2d():
    """Tests the register commands that are generated for a conv2d operation"""
    ifm_quant = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    ofm_quant = NpuQuantization(scale_f32=0.20392157, zero_point=128)
    conv = NpuConv2DOperation()
    conv.ifm = create_feature_map(NpuShape3D(height=30, width=62, depth=46), 1, 512, quant=ifm_quant)
    conv.ofm = create_feature_map(NpuShape3D(height=30, width=31, depth=46), 1, 0x14E40, quant=ofm_quant)
    conv.kernel = NpuKernel(3, 2, 2, 1)
    conv.weights = [NpuAddressRange(region=0, address=0, length=7696)]
    conv.biases = [NpuAddressRange(region=0, address=32000, length=464)]
    conv.padding = NpuPadding(top=0, left=0, right=1, bottom=1)
    conv.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
    # In this example we assume that the weights were compressed with ofm depth 16;
    # let vela choose suitable block width and height by setting these to -1
    conv.block_config = NpuShape3D(height=-1, width=-1, depth=16)
    cmds = generate_register_command_stream([conv], Accelerator.Ethos_U55_128)

    # Expected commands as (checker, command, value), verified in this order
    expected = [
        (check_cmd0, cmd0.NPU_SET_IFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE0, 512),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_WIDTH0_M1, 61),
        (check_cmd0, cmd0.NPU_SET_IFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_Y, 2852),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_IFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_IFM_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_UPSCALE, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_TOP, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_LEFT, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_BOTTOM, 1),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_RIGHT, 1),
        (check_cmd0, cmd0.NPU_SET_OFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE0, 85568),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH0_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_Y, 1426),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_OFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_OFM_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_KERNEL_HEIGHT_M1, 1),
        (check_cmd0, cmd0.NPU_SET_KERNEL_WIDTH_M1, 2),
        (check_cmd0, cmd0.NPU_SET_KERNEL_STRIDE, 5),
        (check_cmd0, cmd0.NPU_SET_WEIGHT_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_BASE, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_LENGTH, 7696),
        (check_cmd0, cmd0.NPU_SET_SCALE_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_SCALE_BASE, 32000),
        (check_cmd1, cmd1.NPU_SET_SCALE_LENGTH, 464),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MIN, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MAX, 255),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15),
        (check_cmd0, cmd0.NPU_SET_IFM_IB_END, 14),
        (check_cmd0, cmd0.NPU_SET_AB_START, 14),
        (check_cmd0, cmd0.NPU_SET_ACC_FORMAT, 0),
        (check_cmd0, cmd0.NPU_SET_BLOCKDEP, 0),
        (check_cmd0, cmd0.NPU_OP_CONV, 0),
    ]
    for check, command, value in expected:
        check(cmds, command, value)

    # Check that block width/height were generated that fit
    blk_height_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    blk_width_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert blk_height_m1 > 0
    assert blk_width_m1 > 0
    # The registers are named *_M1 (presumably "minus 1"), hence the +1 on each
    block_area = (blk_height_m1 + 1) * (blk_width_m1 + 1)
    assert block_area <= 64
174
175
def create_fully_connected_op() -> NpuConv2DOperation:
    """Creates a 1x1 conv2d operation on 1x1 feature maps in NHCWB16 layout.

    Used by the tests in this file as their fully connected operation.
    """
    ifm = create_feature_map(
        NpuShape3D(height=1, width=1, depth=114),
        1,
        0,
        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
        layout=NpuLayout.NHCWB16,
    )
    ofm = create_feature_map(
        NpuShape3D(height=1, width=1, depth=96),
        1,
        0x6A0,
        quant=NpuQuantization(scale_f32=0.20392157, zero_point=128),
        layout=NpuLayout.NHCWB16,
    )
    fc_op = NpuConv2DOperation()
    fc_op.ifm = ifm
    fc_op.ofm = ofm
    fc_op.kernel = NpuKernel(1, 1)
    fc_op.weights = [NpuAddressRange(region=0, address=0x16880, length=13120)]
    fc_op.biases = [NpuAddressRange(region=0, address=0x19BC0, length=960)]
    fc_op.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
    fc_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    # In this example we assume that the weights were compressed with ofm depth 96;
    # let vela choose suitable block width and height by setting these to -1
    fc_op.block_config = NpuShape3D(height=-1, width=-1, depth=96)
    return fc_op
201
202
def test_fully_connected():
    """Tests that a fully connected operation produces a command stream with a CONV command"""
    operations = [create_fully_connected_op()]
    command_stream = generate_register_command_stream(operations, Accelerator.Ethos_U55_128)
    check_cmd0(command_stream, cmd0.NPU_OP_CONV, 0)
    # Sanity check on the stream size: more than 20 words are expected
    assert len(command_stream) > 20
209
210
def test_depthwise():
    """Tests a depthwise operation that is preceded by a DMA operation.

    The DMA destination range is also used as the weights of the depthwise
    operation, and the test expects a DMA WAIT in the generated stream.
    """
    dma_source = NpuAddressRange(region=0, address=0x40, length=96)
    dma_target = NpuAddressRange(region=1, address=0x10000, length=96)
    weights_dma = NpuDmaOperation(dma_source, dma_target)

    depthwise = NpuConvDepthWiseOperation()
    depthwise.ifm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8),
        1,
        0x0,
        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
    )
    depthwise.ofm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8),
        1,
        0x8000,
        quant=NpuQuantization(scale_f32=0.062745101749897, zero_point=128),
    )
    depthwise.kernel = NpuKernel(3, 3)
    depthwise.padding = NpuPadding(top=1, left=1, right=1, bottom=1)
    depthwise.weights = [dma_target]
    depthwise.biases = [NpuAddressRange(region=0, address=0, length=80)]
    depthwise.block_config = NpuShape3D(height=-1, width=-1, depth=8)

    cmds = generate_register_command_stream([weights_dma, depthwise], Accelerator.Ethos_U55_128)

    # Expected commands as (checker, command, value), verified in this order
    expected = [
        (check_cmd0, cmd0.NPU_SET_DMA0_SRC_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_DMA0_SRC, 0x40),
        (check_cmd0, cmd0.NPU_SET_DMA0_DST_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_DMA0_DST, 0x10000),
        (check_cmd1, cmd1.NPU_SET_DMA0_LEN, 96),
        (check_cmd0, cmd0.NPU_OP_DMA_START, 0),
        # A DMA WAIT should have been inserted
        (check_cmd0, cmd0.NPU_OP_DMA_WAIT, 0),
        (check_cmd0, cmd0.NPU_OP_DEPTHWISE, 0),
    ]
    for check, command, value in expected:
        check(cmds, command, value)

    # Vela was asked to pick the block width/height (-1); both must be set
    blk_height_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    blk_width_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert blk_height_m1 > 0
    assert blk_width_m1 > 0
240
241
def test_mul_with_broadcast_and_relu():
    """Tests elementwise multiplication with a broadcasted IFM2 and a RELU activation.

    No block config is set on the operation, so the block dimensions that end
    up in the command stream are the ones chosen by vela.
    """
    op = NpuElementWiseOperation(NpuElementWiseOp.MUL)
    op.ifm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x20)
    # IFM2 has height 1 and depth 1, so it is broadcast against the IFM
    op.ifm2 = create_feature_map(NpuShape3D(height=1, width=22, depth=1), 1, 0)
    op.ofm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x52C0)
    op.activation = NpuActivation(NpuActivationOp.NONE_OR_RELU)
    op.activation.min = 0  # RELU
    # Do not set a block config, let vela choose one
    cmds = generate_register_command_stream([op], Accelerator.Ethos_U55_32)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_SCALE, 1073741824, 30)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE0, 32)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_HEIGHT0_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_HEIGHT1_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_WIDTH0_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_DEPTH_M1, 30)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_Y, 682)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_X, 31)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_PRECISION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_UPSCALE, 0)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE0, 21184)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT0_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT1_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_WIDTH0_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_WIDTH_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_DEPTH_M1, 30)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_Y, 682)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_X, 31)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_PRECISION, 256)
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION_MIN, 0)
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION_MAX, 255)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE0, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_HEIGHT0_M1, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_HEIGHT1_M1, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_WIDTH0_M1, 21)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_Y, 22)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_X, 1)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_PRECISION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_BROADCAST, 5)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 23)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 31)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 16)
    check_cmd0(cmds, cmd0.NPU_SET_AB_START, 16)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START, 9)
    check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
    check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0)
    # Check that block width/height were generated that fit
    blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    blk_depth = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1)
    assert blk_height >= 0
    assert blk_width >= 0
    assert blk_depth >= 0
    # The bound applies to the block volume, so all three dimensions are
    # multiplied; the block expected above (24 x 4 x 32) is exactly 3072.
    # NOTE(review): 3072 is presumably the element capacity for this
    # accelerator configuration - confirm against the architecture limits.
    assert (blk_height + 1) * (blk_width + 1) * (blk_depth + 1) <= 3072
318
319
def create_avg_pool_op() -> NpuPoolingOperation:
    """Creates an average pooling operation with IFM and OFM in region 2.

    No block config is set on the returned operation.
    """
    ifm_quant = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    ofm_quant = NpuQuantization(scale_f32=0.20392157, zero_point=128)
    pool_op = NpuPoolingOperation(NpuPoolingOp.AVERAGE)
    pool_op.ifm = create_feature_map(NpuShape3D(height=29, width=30, depth=27), 2, 0, quant=ifm_quant)
    pool_op.ofm = create_feature_map(NpuShape3D(height=10, width=10, depth=27), 2, 0x5BD0, quant=ofm_quant)
    pool_op.kernel = NpuKernel(8, 2, 3, 3)
    pool_op.padding = NpuPadding(top=0, left=2, right=3, bottom=0)
    # Do not set a block config, let vela choose one
    return pool_op
335
336
def test_avg_pool():
    """Tests that an average pool operation produces a command stream with a POOL command"""
    operations = [create_avg_pool_op()]
    command_stream = generate_register_command_stream(operations, Accelerator.Ethos_U55_128)
    check_cmd0(command_stream, cmd0.NPU_OP_POOL, 1)
    # Sanity check on the stream size: more than 10 words are expected
    assert len(command_stream) > 10
343
344
def test_two_operations():
    """Tests code generation for a list of two operations (fully connected + average pool)"""
    operations = [create_fully_connected_op(), create_avg_pool_op()]
    command_stream = generate_register_command_stream(operations, Accelerator.Ethos_U55_64)
    expected = (
        (cmd0.NPU_OP_POOL, 1),
        (cmd0.NPU_OP_CONV, 0),
        (cmd0.NPU_SET_BLOCKDEP, 0),
        # The operations are not dependent, so expect a blockdep 3
        (cmd0.NPU_SET_BLOCKDEP, 3),
    )
    for command, value in expected:
        check_cmd0(command_stream, command, value)
    assert len(command_stream) > 10
356
357
def test_dma_op():
    """Tests DMA operation followed by average pool.

    The DMA destination is the first address range that get_address_ranges
    returns for the average pool's OFM; the source has the same length.

    NOTE(review): this test was previously described as the DMA providing the
    contents of the average pool's IFM, but the destination is taken from
    pool_op.ofm - confirm which feature map is intended.
    """
    avg_pool = create_avg_pool_op()
    assert avg_pool.ofm is not None
    dma_dest = get_address_ranges(avg_pool.ofm)[0]
    assert dma_dest is not None
    dma_src = NpuAddressRange(0, 0x24000, dma_dest.length)
    operations = [NpuDmaOperation(dma_src, dma_dest), avg_pool]
    command_stream = generate_register_command_stream(operations, Accelerator.Ethos_U55_64)
    expected = (
        (cmd0.NPU_OP_DMA_START, 0),
        # A DMA WAIT should have been inserted
        (cmd0.NPU_OP_DMA_WAIT, 0),
        (cmd0.NPU_OP_POOL, 1),
    )
    for command, value in expected:
        check_cmd0(command_stream, command, value)