/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24
25#ifdef __aarch64__
26
27#include "arm_gemm.hpp"
28#include "quantized.hpp"
29#include "utils.hpp"
30
31#include <cassert>
32
33namespace arm_gemm {
34
35template<>
36void row_sums_indirect(
37 unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
38 size_t M, int32_t *out_ptr, const Requantize32 *qp
39)
40{
41 struct KernelArgs {
42 unsigned int num_strings;
43 const unsigned int *string_lengths;
44 unsigned int input_initial_col;
45 } ka;
46
47 unsigned long flags=0;
48 void *input_ptr;
49 size_t input_offset;
50
51 if (A_arg.is_indirect) {
52 input_ptr=(void *)(A_arg.indirect.ptr);
53 input_offset=A_arg.indirect.start_row;
54 ka.input_initial_col=A_arg.indirect.start_col;
55 flags |= 0x8;
56 } else {
57 assert(num_strings==1);
58 input_ptr=(void *)(A_arg.direct.base);
59 input_offset=A_arg.direct.stride;
60 }
61
62 ka.num_strings = num_strings;
63 ka.string_lengths = string_lengths;
64
65 __asm__ __volatile__(
66 "add x19, %x[qp], %[b_offset]\n"
67 "ld1r { v2.4s }, [x19]\n"
68 "neg v2.4s, v2.4s\n"
69 "1:" // Row loop
70 "cmp %x[M], #0x6\n"
71 "bge 86f\n"
72 "cmp %x[M], #0x4\n"
73 "bgt 69f\n"
74 "beq 52f\n"
75 "cmp %x[M], #0x2\n"
76 "bgt 35f\n"
77 "beq 18f\n"
78 "movi v1.8h, #0x0\n"
79 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
80 "movi v0.4s, #0x0\n"
81 "mov x9, #0x0\n"
82 "mov x28, #0x0\n"
83 "2:" // Height 1: String loop
84 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
85 "ldr w27, [x19, x28, LSL #0x2]\n"
86 "tbz %x[flags], #3, 3f\n"
87 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
88 "add x19, x19, %x[input_offset], LSL #3\n"
89 "ldr x26, [x19, #0x0]\n"
90 "cbnz x28, 4f\n"
91 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
92 "add x26, x26, x19\n"
93 "b 4f\n"
94 "3:" // Height 1: setup direct input
95 "mov x26, %x[input_ptr]\n"
96 "4:" // Height 1: input setup done
97 "cmp x27, #0x10\n"
98 "blt 8f\n"
99 "cmp x27, #0x20\n"
100 "blt 7f\n"
101 "5:" // Height 1: Multiply loop: Main loop head
102 "ldr q31, [x26, #0x0]\n"
103 "cmp x9, #0x7e\n"
104 "add x26, x26, #0x10\n"
105 "blt 6f\n"
106 "sadalp v0.4s, v1.8h\n"
107 "movi v1.8h, #0x0\n"
108 "mov x9, #0x0\n"
109 "6:" // Height 1: Multiply loop: unique 1: no collapse
110 "sadalp v1.8h, v31.16b\n"
111 "add x9, x9, #0x1\n"
112 "sub x27, x27, #0x10\n"
113 "cmp x27, #0x20\n"
114 "bge 5b\n"
115 "7:" // Height 1: Multiply loop: Single iteration only
116 "sub x27, x27, #0x10\n"
117 "ldr q31, [x26, #0x0]\n"
118 "add x26, x26, #0x10\n"
119 "sadalp v1.8h, v31.16b\n"
120 "8:" // Height 1: Multiply loop: Main loop skip
121 "cbz x27, 17f\n"
122 "tbz x27, #3, 12f\n"
123 "ldr d31, [x26], #0x8\n"
124 "tbz x27, #2, 10f\n"
125 "ld1 { v31.s }[2], [x26], #0x4\n"
126 "tbz x27, #1, 9f\n"
127 "ld1 { v31.h }[6], [x26], #0x2\n"
128 "tbz x27, #0, 16f\n"
129 "ld1 { v31.b }[14], [x26]\n"
130 "b 16f\n"
131 "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
132 "tbz x27, #0, 16f\n"
133 "ld1 { v31.b }[12], [x26]\n"
134 "b 16f\n"
135 "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
136 "tbz x27, #1, 11f\n"
137 "ld1 { v31.h }[4], [x26], #0x2\n"
138 "tbz x27, #0, 16f\n"
139 "ld1 { v31.b }[10], [x26]\n"
140 "b 16f\n"
141 "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
142 "tbz x27, #0, 16f\n"
143 "ld1 { v31.b }[8], [x26]\n"
144 "b 16f\n"
145 "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
146 "tbz x27, #2, 14f\n"
147 "ldr s31, [x26], #0x4\n"
148 "tbz x27, #1, 13f\n"
149 "ld1 { v31.h }[2], [x26], #0x2\n"
150 "tbz x27, #0, 16f\n"
151 "ld1 { v31.b }[6], [x26]\n"
152 "b 16f\n"
153 "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
154 "tbz x27, #0, 16f\n"
155 "ld1 { v31.b }[4], [x26]\n"
156 "b 16f\n"
157 "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
158 "tbz x27, #1, 15f\n"
159 "ldr h31, [x26], #0x2\n"
160 "tbz x27, #0, 16f\n"
161 "ld1 { v31.b }[2], [x26]\n"
162 "b 16f\n"
163 "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
164 "ldr b31, [x26, #0x0]\n"
165 "16:" // Height 1: Multiply loop: Ragged operand read: Done
166 "sadalp v1.8h, v31.16b\n"
167 "17:" // Height 1: Multiply loop: No odd multiplies
168 "add x28, x28, #0x1\n"
169 "cmp x28, x20\n"
170 "bne 2b\n"
171 "sadalp v0.4s, v1.8h\n"
172 "addp v0.4s, v0.4s, v0.4s\n"
173 "addp v0.4s, v0.4s, v0.4s\n"
174 "mul v0.4s, v0.4s, v2.4s\n"
175 "str s0, [%x[out_ptr]], #0x4\n"
176 "b 104f\n"
177 "18:" // Height 2
178 "movi v1.8h, #0x0\n"
179 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
180 "mov x9, #0x0\n"
181 "movi v0.4s, #0x0\n"
182 "mov x28, #0x0\n"
183 "movi v30.8h, #0x0\n"
184 "movi v29.4s, #0x0\n"
185 "19:" // Height 2: String loop
186 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
187 "ldr w27, [x19, x28, LSL #0x2]\n"
188 "tbz %x[flags], #3, 20f\n"
189 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
190 "add x19, x19, %x[input_offset], LSL #3\n"
191 "ldr x26, [x19, #0x0]\n"
192 "ldr x25, [x19, #0x8]\n"
193 "cbnz x28, 21f\n"
194 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
195 "add x26, x26, x19\n"
196 "add x25, x25, x19\n"
197 "b 21f\n"
198 "20:" // Height 2: setup direct input
199 "mov x26, %x[input_ptr]\n"
200 "add x25, x26, %x[input_offset]\n"
201 "21:" // Height 2: input setup done
202 "cmp x27, #0x10\n"
203 "blt 25f\n"
204 "cmp x27, #0x20\n"
205 "blt 24f\n"
206 "22:" // Height 2: Multiply loop: Main loop head
207 "ldr q31, [x26, #0x0]\n"
208 "ldr q28, [x25, #0x0]\n"
209 "cmp x9, #0x7e\n"
210 "add x26, x26, #0x10\n"
211 "add x25, x25, #0x10\n"
212 "blt 23f\n"
213 "sadalp v0.4s, v1.8h\n"
214 "movi v1.8h, #0x0\n"
215 "sadalp v29.4s, v30.8h\n"
216 "movi v30.8h, #0x0\n"
217 "mov x9, #0x0\n"
218 "23:" // Height 2: Multiply loop: unique 2: no collapse
219 "sadalp v1.8h, v31.16b\n"
220 "sadalp v30.8h, v28.16b\n"
221 "add x9, x9, #0x1\n"
222 "sub x27, x27, #0x10\n"
223 "cmp x27, #0x20\n"
224 "bge 22b\n"
225 "24:" // Height 2: Multiply loop: Single iteration only
226 "sub x27, x27, #0x10\n"
227 "ldr q31, [x26, #0x0]\n"
228 "ldr q28, [x25, #0x0]\n"
229 "add x26, x26, #0x10\n"
230 "add x25, x25, #0x10\n"
231 "sadalp v1.8h, v31.16b\n"
232 "sadalp v30.8h, v28.16b\n"
233 "25:" // Height 2: Multiply loop: Main loop skip
234 "cbz x27, 34f\n"
235 "tbz x27, #3, 29f\n"
236 "ldr d31, [x26], #0x8\n"
237 "ldr d28, [x25], #0x8\n"
238 "tbz x27, #2, 27f\n"
239 "ld1 { v31.s }[2], [x26], #0x4\n"
240 "ld1 { v28.s }[2], [x25], #0x4\n"
241 "tbz x27, #1, 26f\n"
242 "ld1 { v31.h }[6], [x26], #0x2\n"
243 "ld1 { v28.h }[6], [x25], #0x2\n"
244 "tbz x27, #0, 33f\n"
245 "ld1 { v31.b }[14], [x26]\n"
246 "ld1 { v28.b }[14], [x25]\n"
247 "b 33f\n"
248 "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
249 "tbz x27, #0, 33f\n"
250 "ld1 { v31.b }[12], [x26]\n"
251 "ld1 { v28.b }[12], [x25]\n"
252 "b 33f\n"
253 "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
254 "tbz x27, #1, 28f\n"
255 "ld1 { v31.h }[4], [x26], #0x2\n"
256 "ld1 { v28.h }[4], [x25], #0x2\n"
257 "tbz x27, #0, 33f\n"
258 "ld1 { v31.b }[10], [x26]\n"
259 "ld1 { v28.b }[10], [x25]\n"
260 "b 33f\n"
261 "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
262 "tbz x27, #0, 33f\n"
263 "ld1 { v31.b }[8], [x26]\n"
264 "ld1 { v28.b }[8], [x25]\n"
265 "b 33f\n"
266 "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
267 "tbz x27, #2, 31f\n"
268 "ldr s31, [x26], #0x4\n"
269 "ldr s28, [x25], #0x4\n"
270 "tbz x27, #1, 30f\n"
271 "ld1 { v31.h }[2], [x26], #0x2\n"
272 "ld1 { v28.h }[2], [x25], #0x2\n"
273 "tbz x27, #0, 33f\n"
274 "ld1 { v31.b }[6], [x26]\n"
275 "ld1 { v28.b }[6], [x25]\n"
276 "b 33f\n"
277 "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
278 "tbz x27, #0, 33f\n"
279 "ld1 { v31.b }[4], [x26]\n"
280 "ld1 { v28.b }[4], [x25]\n"
281 "b 33f\n"
282 "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
283 "tbz x27, #1, 32f\n"
284 "ldr h31, [x26], #0x2\n"
285 "ldr h28, [x25], #0x2\n"
286 "tbz x27, #0, 33f\n"
287 "ld1 { v31.b }[2], [x26]\n"
288 "ld1 { v28.b }[2], [x25]\n"
289 "b 33f\n"
290 "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
291 "ldr b31, [x26, #0x0]\n"
292 "ldr b28, [x25, #0x0]\n"
293 "33:" // Height 2: Multiply loop: Ragged operand read: Done
294 "sadalp v1.8h, v31.16b\n"
295 "sadalp v30.8h, v28.16b\n"
296 "34:" // Height 2: Multiply loop: No odd multiplies
297 "add x28, x28, #0x1\n"
298 "cmp x28, x20\n"
299 "bne 19b\n"
300 "sadalp v0.4s, v1.8h\n"
301 "sadalp v29.4s, v30.8h\n"
302 "addp v0.4s, v0.4s, v29.4s\n"
303 "addp v0.4s, v0.4s, v0.4s\n"
304 "mul v0.4s, v0.4s, v2.4s\n"
305 "str d0, [%x[out_ptr]], #0x8\n"
306 "b 104f\n"
307 "35:" // Height 3
308 "movi v1.8h, #0x0\n"
309 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
310 "mov x9, #0x0\n"
311 "movi v0.4s, #0x0\n"
312 "mov x28, #0x0\n"
313 "movi v30.8h, #0x0\n"
314 "movi v29.4s, #0x0\n"
315 "movi v27.8h, #0x0\n"
316 "movi v26.4s, #0x0\n"
317 "36:" // Height 3: String loop
318 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
319 "ldr w27, [x19, x28, LSL #0x2]\n"
320 "tbz %x[flags], #3, 37f\n"
321 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
322 "add x19, x19, %x[input_offset], LSL #3\n"
323 "ldr x26, [x19, #0x0]\n"
324 "ldr x25, [x19, #0x8]\n"
325 "ldr x24, [x19, #0x10]\n"
326 "cbnz x28, 38f\n"
327 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
328 "add x26, x26, x19\n"
329 "add x25, x25, x19\n"
330 "add x24, x24, x19\n"
331 "b 38f\n"
332 "37:" // Height 3: setup direct input
333 "mov x26, %x[input_ptr]\n"
334 "add x25, x26, %x[input_offset]\n"
335 "add x24, x25, %x[input_offset]\n"
336 "38:" // Height 3: input setup done
337 "cmp x27, #0x10\n"
338 "blt 42f\n"
339 "cmp x27, #0x20\n"
340 "blt 41f\n"
341 "39:" // Height 3: Multiply loop: Main loop head
342 "ldr q31, [x26, #0x0]\n"
343 "ldr q28, [x25, #0x0]\n"
344 "ldr q25, [x24, #0x0]\n"
345 "cmp x9, #0x7e\n"
346 "add x26, x26, #0x10\n"
347 "add x25, x25, #0x10\n"
348 "add x24, x24, #0x10\n"
349 "blt 40f\n"
350 "sadalp v0.4s, v1.8h\n"
351 "movi v1.8h, #0x0\n"
352 "sadalp v29.4s, v30.8h\n"
353 "movi v30.8h, #0x0\n"
354 "sadalp v26.4s, v27.8h\n"
355 "movi v27.8h, #0x0\n"
356 "mov x9, #0x0\n"
357 "40:" // Height 3: Multiply loop: unique 3: no collapse
358 "sadalp v1.8h, v31.16b\n"
359 "sadalp v30.8h, v28.16b\n"
360 "sadalp v27.8h, v25.16b\n"
361 "add x9, x9, #0x1\n"
362 "sub x27, x27, #0x10\n"
363 "cmp x27, #0x20\n"
364 "bge 39b\n"
365 "41:" // Height 3: Multiply loop: Single iteration only
366 "sub x27, x27, #0x10\n"
367 "ldr q31, [x26, #0x0]\n"
368 "ldr q28, [x25, #0x0]\n"
369 "ldr q25, [x24, #0x0]\n"
370 "add x26, x26, #0x10\n"
371 "add x25, x25, #0x10\n"
372 "sadalp v1.8h, v31.16b\n"
373 "sadalp v30.8h, v28.16b\n"
374 "sadalp v27.8h, v25.16b\n"
375 "add x24, x24, #0x10\n"
376 "42:" // Height 3: Multiply loop: Main loop skip
377 "cbz x27, 51f\n"
378 "tbz x27, #3, 46f\n"
379 "ldr d31, [x26], #0x8\n"
380 "ldr d28, [x25], #0x8\n"
381 "ldr d25, [x24], #0x8\n"
382 "tbz x27, #2, 44f\n"
383 "ld1 { v31.s }[2], [x26], #0x4\n"
384 "ld1 { v28.s }[2], [x25], #0x4\n"
385 "ld1 { v25.s }[2], [x24], #0x4\n"
386 "tbz x27, #1, 43f\n"
387 "ld1 { v31.h }[6], [x26], #0x2\n"
388 "ld1 { v28.h }[6], [x25], #0x2\n"
389 "ld1 { v25.h }[6], [x24], #0x2\n"
390 "tbz x27, #0, 50f\n"
391 "ld1 { v31.b }[14], [x26]\n"
392 "ld1 { v28.b }[14], [x25]\n"
393 "ld1 { v25.b }[14], [x24]\n"
394 "b 50f\n"
395 "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
396 "tbz x27, #0, 50f\n"
397 "ld1 { v31.b }[12], [x26]\n"
398 "ld1 { v28.b }[12], [x25]\n"
399 "ld1 { v25.b }[12], [x24]\n"
400 "b 50f\n"
401 "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
402 "tbz x27, #1, 45f\n"
403 "ld1 { v31.h }[4], [x26], #0x2\n"
404 "ld1 { v28.h }[4], [x25], #0x2\n"
405 "ld1 { v25.h }[4], [x24], #0x2\n"
406 "tbz x27, #0, 50f\n"
407 "ld1 { v31.b }[10], [x26]\n"
408 "ld1 { v28.b }[10], [x25]\n"
409 "ld1 { v25.b }[10], [x24]\n"
410 "b 50f\n"
411 "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
412 "tbz x27, #0, 50f\n"
413 "ld1 { v31.b }[8], [x26]\n"
414 "ld1 { v28.b }[8], [x25]\n"
415 "ld1 { v25.b }[8], [x24]\n"
416 "b 50f\n"
417 "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
418 "tbz x27, #2, 48f\n"
419 "ldr s31, [x26], #0x4\n"
420 "ldr s28, [x25], #0x4\n"
421 "ldr s25, [x24], #0x4\n"
422 "tbz x27, #1, 47f\n"
423 "ld1 { v31.h }[2], [x26], #0x2\n"
424 "ld1 { v28.h }[2], [x25], #0x2\n"
425 "ld1 { v25.h }[2], [x24], #0x2\n"
426 "tbz x27, #0, 50f\n"
427 "ld1 { v31.b }[6], [x26]\n"
428 "ld1 { v28.b }[6], [x25]\n"
429 "ld1 { v25.b }[6], [x24]\n"
430 "b 50f\n"
431 "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
432 "tbz x27, #0, 50f\n"
433 "ld1 { v31.b }[4], [x26]\n"
434 "ld1 { v28.b }[4], [x25]\n"
435 "ld1 { v25.b }[4], [x24]\n"
436 "b 50f\n"
437 "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
438 "tbz x27, #1, 49f\n"
439 "ldr h31, [x26], #0x2\n"
440 "ldr h28, [x25], #0x2\n"
441 "ldr h25, [x24], #0x2\n"
442 "tbz x27, #0, 50f\n"
443 "ld1 { v31.b }[2], [x26]\n"
444 "ld1 { v28.b }[2], [x25]\n"
445 "ld1 { v25.b }[2], [x24]\n"
446 "b 50f\n"
447 "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
448 "ldr b31, [x26, #0x0]\n"
449 "ldr b28, [x25, #0x0]\n"
450 "ldr b25, [x24, #0x0]\n"
451 "50:" // Height 3: Multiply loop: Ragged operand read: Done
452 "sadalp v1.8h, v31.16b\n"
453 "sadalp v30.8h, v28.16b\n"
454 "sadalp v27.8h, v25.16b\n"
455 "51:" // Height 3: Multiply loop: No odd multiplies
456 "add x28, x28, #0x1\n"
457 "cmp x28, x20\n"
458 "bne 36b\n"
459 "sadalp v0.4s, v1.8h\n"
460 "sadalp v29.4s, v30.8h\n"
461 "addp v0.4s, v0.4s, v29.4s\n"
462 "sadalp v26.4s, v27.8h\n"
463 "addp v0.4s, v0.4s, v0.4s\n"
464 "addp v26.4s, v26.4s, v26.4s\n"
465 "mul v0.4s, v0.4s, v2.4s\n"
466 "str d0, [%x[out_ptr]], #0x8\n"
467 "addp v26.4s, v26.4s, v26.4s\n"
468 "mul v26.4s, v26.4s, v2.4s\n"
469 "str s26, [%x[out_ptr]], #0x4\n"
470 "b 104f\n"
471 "52:" // Height 4
472 "movi v1.8h, #0x0\n"
473 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
474 "mov x9, #0x0\n"
475 "movi v0.4s, #0x0\n"
476 "mov x28, #0x0\n"
477 "movi v30.8h, #0x0\n"
478 "movi v29.4s, #0x0\n"
479 "movi v27.8h, #0x0\n"
480 "movi v26.4s, #0x0\n"
481 "movi v24.8h, #0x0\n"
482 "movi v23.4s, #0x0\n"
483 "53:" // Height 4: String loop
484 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
485 "ldr w27, [x19, x28, LSL #0x2]\n"
486 "tbz %x[flags], #3, 54f\n"
487 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
488 "add x19, x19, %x[input_offset], LSL #3\n"
489 "ldr x26, [x19, #0x0]\n"
490 "ldr x25, [x19, #0x8]\n"
491 "ldr x24, [x19, #0x10]\n"
492 "ldr x23, [x19, #0x18]\n"
493 "cbnz x28, 55f\n"
494 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
495 "add x26, x26, x19\n"
496 "add x25, x25, x19\n"
497 "add x24, x24, x19\n"
498 "add x23, x23, x19\n"
499 "b 55f\n"
500 "54:" // Height 4: setup direct input
501 "mov x26, %x[input_ptr]\n"
502 "add x25, x26, %x[input_offset]\n"
503 "add x24, x25, %x[input_offset]\n"
504 "add x23, x24, %x[input_offset]\n"
505 "55:" // Height 4: input setup done
506 "cmp x27, #0x10\n"
507 "blt 59f\n"
508 "cmp x27, #0x20\n"
509 "blt 58f\n"
510 "56:" // Height 4: Multiply loop: Main loop head
511 "ldr q31, [x26, #0x0]\n"
512 "ldr q28, [x25, #0x0]\n"
513 "ldr q25, [x24, #0x0]\n"
514 "ldr q22, [x23, #0x0]\n"
515 "cmp x9, #0x7e\n"
516 "add x26, x26, #0x10\n"
517 "add x25, x25, #0x10\n"
518 "add x24, x24, #0x10\n"
519 "add x23, x23, #0x10\n"
520 "blt 57f\n"
521 "sadalp v0.4s, v1.8h\n"
522 "movi v1.8h, #0x0\n"
523 "sadalp v29.4s, v30.8h\n"
524 "movi v30.8h, #0x0\n"
525 "sadalp v26.4s, v27.8h\n"
526 "movi v27.8h, #0x0\n"
527 "sadalp v23.4s, v24.8h\n"
528 "movi v24.8h, #0x0\n"
529 "mov x9, #0x0\n"
530 "57:" // Height 4: Multiply loop: unique 4: no collapse
531 "sadalp v1.8h, v31.16b\n"
532 "sadalp v30.8h, v28.16b\n"
533 "sadalp v27.8h, v25.16b\n"
534 "sadalp v24.8h, v22.16b\n"
535 "add x9, x9, #0x1\n"
536 "sub x27, x27, #0x10\n"
537 "cmp x27, #0x20\n"
538 "bge 56b\n"
539 "58:" // Height 4: Multiply loop: Single iteration only
540 "sub x27, x27, #0x10\n"
541 "ldr q31, [x26, #0x0]\n"
542 "ldr q28, [x25, #0x0]\n"
543 "ldr q25, [x24, #0x0]\n"
544 "ldr q22, [x23, #0x0]\n"
545 "add x26, x26, #0x10\n"
546 "sadalp v1.8h, v31.16b\n"
547 "sadalp v30.8h, v28.16b\n"
548 "sadalp v27.8h, v25.16b\n"
549 "sadalp v24.8h, v22.16b\n"
550 "add x25, x25, #0x10\n"
551 "add x24, x24, #0x10\n"
552 "add x23, x23, #0x10\n"
553 "59:" // Height 4: Multiply loop: Main loop skip
554 "cbz x27, 68f\n"
555 "tbz x27, #3, 63f\n"
556 "ldr d31, [x26], #0x8\n"
557 "ldr d28, [x25], #0x8\n"
558 "ldr d25, [x24], #0x8\n"
559 "ldr d22, [x23], #0x8\n"
560 "tbz x27, #2, 61f\n"
561 "ld1 { v31.s }[2], [x26], #0x4\n"
562 "ld1 { v28.s }[2], [x25], #0x4\n"
563 "ld1 { v25.s }[2], [x24], #0x4\n"
564 "ld1 { v22.s }[2], [x23], #0x4\n"
565 "tbz x27, #1, 60f\n"
566 "ld1 { v31.h }[6], [x26], #0x2\n"
567 "ld1 { v28.h }[6], [x25], #0x2\n"
568 "ld1 { v25.h }[6], [x24], #0x2\n"
569 "ld1 { v22.h }[6], [x23], #0x2\n"
570 "tbz x27, #0, 67f\n"
571 "ld1 { v31.b }[14], [x26]\n"
572 "ld1 { v28.b }[14], [x25]\n"
573 "ld1 { v25.b }[14], [x24]\n"
574 "ld1 { v22.b }[14], [x23]\n"
575 "b 67f\n"
576 "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
577 "tbz x27, #0, 67f\n"
578 "ld1 { v31.b }[12], [x26]\n"
579 "ld1 { v28.b }[12], [x25]\n"
580 "ld1 { v25.b }[12], [x24]\n"
581 "ld1 { v22.b }[12], [x23]\n"
582 "b 67f\n"
583 "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
584 "tbz x27, #1, 62f\n"
585 "ld1 { v31.h }[4], [x26], #0x2\n"
586 "ld1 { v28.h }[4], [x25], #0x2\n"
587 "ld1 { v25.h }[4], [x24], #0x2\n"
588 "ld1 { v22.h }[4], [x23], #0x2\n"
589 "tbz x27, #0, 67f\n"
590 "ld1 { v31.b }[10], [x26]\n"
591 "ld1 { v28.b }[10], [x25]\n"
592 "ld1 { v25.b }[10], [x24]\n"
593 "ld1 { v22.b }[10], [x23]\n"
594 "b 67f\n"
595 "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
596 "tbz x27, #0, 67f\n"
597 "ld1 { v31.b }[8], [x26]\n"
598 "ld1 { v28.b }[8], [x25]\n"
599 "ld1 { v25.b }[8], [x24]\n"
600 "ld1 { v22.b }[8], [x23]\n"
601 "b 67f\n"
602 "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
603 "tbz x27, #2, 65f\n"
604 "ldr s31, [x26], #0x4\n"
605 "ldr s28, [x25], #0x4\n"
606 "ldr s25, [x24], #0x4\n"
607 "ldr s22, [x23], #0x4\n"
608 "tbz x27, #1, 64f\n"
609 "ld1 { v31.h }[2], [x26], #0x2\n"
610 "ld1 { v28.h }[2], [x25], #0x2\n"
611 "ld1 { v25.h }[2], [x24], #0x2\n"
612 "ld1 { v22.h }[2], [x23], #0x2\n"
613 "tbz x27, #0, 67f\n"
614 "ld1 { v31.b }[6], [x26]\n"
615 "ld1 { v28.b }[6], [x25]\n"
616 "ld1 { v25.b }[6], [x24]\n"
617 "ld1 { v22.b }[6], [x23]\n"
618 "b 67f\n"
619 "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
620 "tbz x27, #0, 67f\n"
621 "ld1 { v31.b }[4], [x26]\n"
622 "ld1 { v28.b }[4], [x25]\n"
623 "ld1 { v25.b }[4], [x24]\n"
624 "ld1 { v22.b }[4], [x23]\n"
625 "b 67f\n"
626 "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
627 "tbz x27, #1, 66f\n"
628 "ldr h31, [x26], #0x2\n"
629 "ldr h28, [x25], #0x2\n"
630 "ldr h25, [x24], #0x2\n"
631 "ldr h22, [x23], #0x2\n"
632 "tbz x27, #0, 67f\n"
633 "ld1 { v31.b }[2], [x26]\n"
634 "ld1 { v28.b }[2], [x25]\n"
635 "ld1 { v25.b }[2], [x24]\n"
636 "ld1 { v22.b }[2], [x23]\n"
637 "b 67f\n"
638 "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
639 "ldr b31, [x26, #0x0]\n"
640 "ldr b28, [x25, #0x0]\n"
641 "ldr b25, [x24, #0x0]\n"
642 "ldr b22, [x23, #0x0]\n"
643 "67:" // Height 4: Multiply loop: Ragged operand read: Done
644 "sadalp v1.8h, v31.16b\n"
645 "sadalp v30.8h, v28.16b\n"
646 "sadalp v27.8h, v25.16b\n"
647 "sadalp v24.8h, v22.16b\n"
648 "68:" // Height 4: Multiply loop: No odd multiplies
649 "add x28, x28, #0x1\n"
650 "cmp x28, x20\n"
651 "bne 53b\n"
652 "sadalp v0.4s, v1.8h\n"
653 "sadalp v29.4s, v30.8h\n"
654 "addp v0.4s, v0.4s, v29.4s\n"
655 "sadalp v26.4s, v27.8h\n"
656 "sadalp v23.4s, v24.8h\n"
657 "addp v29.4s, v26.4s, v23.4s\n"
658 "addp v0.4s, v0.4s, v29.4s\n"
659 "mul v0.4s, v0.4s, v2.4s\n"
660 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
661 "b 104f\n"
662 "69:" // Height 5
663 "movi v1.8h, #0x0\n"
664 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
665 "mov x9, #0x0\n"
666 "movi v0.4s, #0x0\n"
667 "mov x28, #0x0\n"
668 "movi v30.8h, #0x0\n"
669 "movi v29.4s, #0x0\n"
670 "movi v27.8h, #0x0\n"
671 "movi v26.4s, #0x0\n"
672 "movi v24.8h, #0x0\n"
673 "movi v23.4s, #0x0\n"
674 "movi v21.8h, #0x0\n"
675 "movi v20.4s, #0x0\n"
676 "70:" // Height 5: String loop
677 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
678 "ldr w27, [x19, x28, LSL #0x2]\n"
679 "tbz %x[flags], #3, 71f\n"
680 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
681 "add x19, x19, %x[input_offset], LSL #3\n"
682 "ldr x26, [x19, #0x0]\n"
683 "ldr x25, [x19, #0x8]\n"
684 "ldr x24, [x19, #0x10]\n"
685 "ldr x23, [x19, #0x18]\n"
686 "ldr x22, [x19, #0x20]\n"
687 "cbnz x28, 72f\n"
688 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
689 "add x26, x26, x19\n"
690 "add x25, x25, x19\n"
691 "add x24, x24, x19\n"
692 "add x23, x23, x19\n"
693 "add x22, x22, x19\n"
694 "b 72f\n"
695 "71:" // Height 5: setup direct input
696 "mov x26, %x[input_ptr]\n"
697 "add x25, x26, %x[input_offset]\n"
698 "add x24, x25, %x[input_offset]\n"
699 "add x23, x24, %x[input_offset]\n"
700 "add x22, x23, %x[input_offset]\n"
701 "72:" // Height 5: input setup done
702 "cmp x27, #0x10\n"
703 "blt 76f\n"
704 "cmp x27, #0x20\n"
705 "blt 75f\n"
706 "73:" // Height 5: Multiply loop: Main loop head
707 "ldr q31, [x26, #0x0]\n"
708 "ldr q28, [x25, #0x0]\n"
709 "ldr q25, [x24, #0x0]\n"
710 "ldr q22, [x23, #0x0]\n"
711 "ldr q19, [x22, #0x0]\n"
712 "cmp x9, #0x7e\n"
713 "add x26, x26, #0x10\n"
714 "add x25, x25, #0x10\n"
715 "add x24, x24, #0x10\n"
716 "add x23, x23, #0x10\n"
717 "add x22, x22, #0x10\n"
718 "blt 74f\n"
719 "sadalp v0.4s, v1.8h\n"
720 "movi v1.8h, #0x0\n"
721 "sadalp v29.4s, v30.8h\n"
722 "movi v30.8h, #0x0\n"
723 "sadalp v26.4s, v27.8h\n"
724 "movi v27.8h, #0x0\n"
725 "sadalp v23.4s, v24.8h\n"
726 "movi v24.8h, #0x0\n"
727 "sadalp v20.4s, v21.8h\n"
728 "movi v21.8h, #0x0\n"
729 "mov x9, #0x0\n"
730 "74:" // Height 5: Multiply loop: unique 5: no collapse
731 "sadalp v1.8h, v31.16b\n"
732 "sadalp v30.8h, v28.16b\n"
733 "sadalp v27.8h, v25.16b\n"
734 "sadalp v24.8h, v22.16b\n"
735 "sadalp v21.8h, v19.16b\n"
736 "add x9, x9, #0x1\n"
737 "sub x27, x27, #0x10\n"
738 "cmp x27, #0x20\n"
739 "bge 73b\n"
740 "75:" // Height 5: Multiply loop: Single iteration only
741 "sub x27, x27, #0x10\n"
742 "ldr q31, [x26, #0x0]\n"
743 "ldr q28, [x25, #0x0]\n"
744 "ldr q25, [x24, #0x0]\n"
745 "ldr q22, [x23, #0x0]\n"
746 "ldr q19, [x22, #0x0]\n"
747 "sadalp v1.8h, v31.16b\n"
748 "sadalp v30.8h, v28.16b\n"
749 "sadalp v27.8h, v25.16b\n"
750 "sadalp v24.8h, v22.16b\n"
751 "sadalp v21.8h, v19.16b\n"
752 "add x26, x26, #0x10\n"
753 "add x25, x25, #0x10\n"
754 "add x24, x24, #0x10\n"
755 "add x23, x23, #0x10\n"
756 "add x22, x22, #0x10\n"
757 "76:" // Height 5: Multiply loop: Main loop skip
758 "cbz x27, 85f\n"
759 "tbz x27, #3, 80f\n"
760 "ldr d31, [x26], #0x8\n"
761 "ldr d28, [x25], #0x8\n"
762 "ldr d25, [x24], #0x8\n"
763 "ldr d22, [x23], #0x8\n"
764 "ldr d19, [x22], #0x8\n"
765 "tbz x27, #2, 78f\n"
766 "ld1 { v31.s }[2], [x26], #0x4\n"
767 "ld1 { v28.s }[2], [x25], #0x4\n"
768 "ld1 { v25.s }[2], [x24], #0x4\n"
769 "ld1 { v22.s }[2], [x23], #0x4\n"
770 "ld1 { v19.s }[2], [x22], #0x4\n"
771 "tbz x27, #1, 77f\n"
772 "ld1 { v31.h }[6], [x26], #0x2\n"
773 "ld1 { v28.h }[6], [x25], #0x2\n"
774 "ld1 { v25.h }[6], [x24], #0x2\n"
775 "ld1 { v22.h }[6], [x23], #0x2\n"
776 "ld1 { v19.h }[6], [x22], #0x2\n"
777 "tbz x27, #0, 84f\n"
778 "ld1 { v31.b }[14], [x26]\n"
779 "ld1 { v28.b }[14], [x25]\n"
780 "ld1 { v25.b }[14], [x24]\n"
781 "ld1 { v22.b }[14], [x23]\n"
782 "ld1 { v19.b }[14], [x22]\n"
783 "b 84f\n"
784 "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
785 "tbz x27, #0, 84f\n"
786 "ld1 { v31.b }[12], [x26]\n"
787 "ld1 { v28.b }[12], [x25]\n"
788 "ld1 { v25.b }[12], [x24]\n"
789 "ld1 { v22.b }[12], [x23]\n"
790 "ld1 { v19.b }[12], [x22]\n"
791 "b 84f\n"
792 "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
793 "tbz x27, #1, 79f\n"
794 "ld1 { v31.h }[4], [x26], #0x2\n"
795 "ld1 { v28.h }[4], [x25], #0x2\n"
796 "ld1 { v25.h }[4], [x24], #0x2\n"
797 "ld1 { v22.h }[4], [x23], #0x2\n"
798 "ld1 { v19.h }[4], [x22], #0x2\n"
799 "tbz x27, #0, 84f\n"
800 "ld1 { v31.b }[10], [x26]\n"
801 "ld1 { v28.b }[10], [x25]\n"
802 "ld1 { v25.b }[10], [x24]\n"
803 "ld1 { v22.b }[10], [x23]\n"
804 "ld1 { v19.b }[10], [x22]\n"
805 "b 84f\n"
806 "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
807 "tbz x27, #0, 84f\n"
808 "ld1 { v31.b }[8], [x26]\n"
809 "ld1 { v28.b }[8], [x25]\n"
810 "ld1 { v25.b }[8], [x24]\n"
811 "ld1 { v22.b }[8], [x23]\n"
812 "ld1 { v19.b }[8], [x22]\n"
813 "b 84f\n"
814 "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
815 "tbz x27, #2, 82f\n"
816 "ldr s31, [x26], #0x4\n"
817 "ldr s28, [x25], #0x4\n"
818 "ldr s25, [x24], #0x4\n"
819 "ldr s22, [x23], #0x4\n"
820 "ldr s19, [x22], #0x4\n"
821 "tbz x27, #1, 81f\n"
822 "ld1 { v31.h }[2], [x26], #0x2\n"
823 "ld1 { v28.h }[2], [x25], #0x2\n"
824 "ld1 { v25.h }[2], [x24], #0x2\n"
825 "ld1 { v22.h }[2], [x23], #0x2\n"
826 "ld1 { v19.h }[2], [x22], #0x2\n"
827 "tbz x27, #0, 84f\n"
828 "ld1 { v31.b }[6], [x26]\n"
829 "ld1 { v28.b }[6], [x25]\n"
830 "ld1 { v25.b }[6], [x24]\n"
831 "ld1 { v22.b }[6], [x23]\n"
832 "ld1 { v19.b }[6], [x22]\n"
833 "b 84f\n"
834 "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
835 "tbz x27, #0, 84f\n"
836 "ld1 { v31.b }[4], [x26]\n"
837 "ld1 { v28.b }[4], [x25]\n"
838 "ld1 { v25.b }[4], [x24]\n"
839 "ld1 { v22.b }[4], [x23]\n"
840 "ld1 { v19.b }[4], [x22]\n"
841 "b 84f\n"
842 "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
843 "tbz x27, #1, 83f\n"
844 "ldr h31, [x26], #0x2\n"
845 "ldr h28, [x25], #0x2\n"
846 "ldr h25, [x24], #0x2\n"
847 "ldr h22, [x23], #0x2\n"
848 "ldr h19, [x22], #0x2\n"
849 "tbz x27, #0, 84f\n"
850 "ld1 { v31.b }[2], [x26]\n"
851 "ld1 { v28.b }[2], [x25]\n"
852 "ld1 { v25.b }[2], [x24]\n"
853 "ld1 { v22.b }[2], [x23]\n"
854 "ld1 { v19.b }[2], [x22]\n"
855 "b 84f\n"
856 "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
857 "ldr b31, [x26, #0x0]\n"
858 "ldr b28, [x25, #0x0]\n"
859 "ldr b25, [x24, #0x0]\n"
860 "ldr b22, [x23, #0x0]\n"
861 "ldr b19, [x22, #0x0]\n"
862 "84:" // Height 5: Multiply loop: Ragged operand read: Done
863 "sadalp v1.8h, v31.16b\n"
864 "sadalp v30.8h, v28.16b\n"
865 "sadalp v27.8h, v25.16b\n"
866 "sadalp v24.8h, v22.16b\n"
867 "sadalp v21.8h, v19.16b\n"
868 "85:" // Height 5: Multiply loop: No odd multiplies
869 "add x28, x28, #0x1\n"
870 "cmp x28, x20\n"
871 "bne 70b\n"
872 "sadalp v0.4s, v1.8h\n"
873 "sadalp v29.4s, v30.8h\n"
874 "addp v0.4s, v0.4s, v29.4s\n"
875 "sadalp v26.4s, v27.8h\n"
876 "sadalp v23.4s, v24.8h\n"
877 "addp v29.4s, v26.4s, v23.4s\n"
878 "sadalp v20.4s, v21.8h\n"
879 "addp v0.4s, v0.4s, v29.4s\n"
880 "addp v20.4s, v20.4s, v20.4s\n"
881 "mul v0.4s, v0.4s, v2.4s\n"
882 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
883 "addp v20.4s, v20.4s, v20.4s\n"
884 "mul v20.4s, v20.4s, v2.4s\n"
885 "str s20, [%x[out_ptr]], #0x4\n"
886 "b 104f\n"
887 "86:" // Height 6
888 "movi v1.8h, #0x0\n"
889 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
890 "mov x9, #0x0\n"
891 "movi v0.4s, #0x0\n"
892 "mov x28, #0x0\n"
893 "movi v30.8h, #0x0\n"
894 "movi v29.4s, #0x0\n"
895 "movi v27.8h, #0x0\n"
896 "movi v26.4s, #0x0\n"
897 "movi v24.8h, #0x0\n"
898 "movi v23.4s, #0x0\n"
899 "movi v21.8h, #0x0\n"
900 "movi v20.4s, #0x0\n"
901 "movi v18.8h, #0x0\n"
902 "movi v17.4s, #0x0\n"
903 "87:" // Height 6: String loop
904 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
905 "ldr w27, [x19, x28, LSL #0x2]\n"
906 "tbz %x[flags], #3, 88f\n"
907 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
908 "add x19, x19, %x[input_offset], LSL #3\n"
909 "ldr x26, [x19, #0x0]\n"
910 "ldr x25, [x19, #0x8]\n"
911 "ldr x24, [x19, #0x10]\n"
912 "ldr x23, [x19, #0x18]\n"
913 "ldr x22, [x19, #0x20]\n"
914 "ldr x20, [x19, #0x28]\n"
915 "cbnz x28, 89f\n"
916 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
917 "add x26, x26, x19\n"
918 "add x25, x25, x19\n"
919 "add x24, x24, x19\n"
920 "add x23, x23, x19\n"
921 "add x22, x22, x19\n"
922 "add x20, x20, x19\n"
923 "b 89f\n"
924 "88:" // Height 6: setup direct input
925 "mov x26, %x[input_ptr]\n"
926 "add x25, x26, %x[input_offset]\n"
927 "add x24, x25, %x[input_offset]\n"
928 "add x23, x24, %x[input_offset]\n"
929 "add x22, x23, %x[input_offset]\n"
930 "add x20, x22, %x[input_offset]\n"
931 "89:" // Height 6: input setup done
932 "cmp x27, #0x10\n"
933 "blt 93f\n"
934 "cmp x27, #0x20\n"
935 "blt 92f\n"
936 "90:" // Height 6: Multiply loop: Main loop head
937 "ldr q31, [x26, #0x0]\n"
938 "ldr q28, [x25, #0x0]\n"
939 "ldr q25, [x24, #0x0]\n"
940 "ldr q22, [x23, #0x0]\n"
941 "ldr q19, [x22, #0x0]\n"
942 "ldr q16, [x20, #0x0]\n"
943 "cmp x9, #0x7e\n"
944 "add x26, x26, #0x10\n"
945 "add x25, x25, #0x10\n"
946 "add x24, x24, #0x10\n"
947 "add x23, x23, #0x10\n"
948 "add x22, x22, #0x10\n"
949 "add x20, x20, #0x10\n"
950 "blt 91f\n"
951 "sadalp v0.4s, v1.8h\n"
952 "movi v1.8h, #0x0\n"
953 "sadalp v29.4s, v30.8h\n"
954 "movi v30.8h, #0x0\n"
955 "sadalp v26.4s, v27.8h\n"
956 "movi v27.8h, #0x0\n"
957 "sadalp v23.4s, v24.8h\n"
958 "movi v24.8h, #0x0\n"
959 "sadalp v20.4s, v21.8h\n"
960 "movi v21.8h, #0x0\n"
961 "sadalp v17.4s, v18.8h\n"
962 "movi v18.8h, #0x0\n"
963 "mov x9, #0x0\n"
964 "91:" // Height 6: Multiply loop: unique 6: no collapse
965 "sadalp v1.8h, v31.16b\n"
966 "sadalp v30.8h, v28.16b\n"
967 "sadalp v27.8h, v25.16b\n"
968 "sadalp v24.8h, v22.16b\n"
969 "sadalp v21.8h, v19.16b\n"
970 "sadalp v18.8h, v16.16b\n"
971 "add x9, x9, #0x1\n"
972 "sub x27, x27, #0x10\n"
973 "cmp x27, #0x20\n"
974 "bge 90b\n"
975 "92:" // Height 6: Multiply loop: Single iteration only
976 "sub x27, x27, #0x10\n"
977 "ldr q31, [x26, #0x0]\n"
978 "ldr q28, [x25, #0x0]\n"
979 "ldr q25, [x24, #0x0]\n"
980 "ldr q22, [x23, #0x0]\n"
981 "ldr q19, [x22, #0x0]\n"
982 "ldr q16, [x20, #0x0]\n"
983 "sadalp v1.8h, v31.16b\n"
984 "sadalp v30.8h, v28.16b\n"
985 "sadalp v27.8h, v25.16b\n"
986 "sadalp v24.8h, v22.16b\n"
987 "sadalp v21.8h, v19.16b\n"
988 "sadalp v18.8h, v16.16b\n"
989 "add x26, x26, #0x10\n"
990 "add x25, x25, #0x10\n"
991 "add x24, x24, #0x10\n"
992 "add x23, x23, #0x10\n"
993 "add x22, x22, #0x10\n"
994 "add x20, x20, #0x10\n"
995 "93:" // Height 6: Multiply loop: Main loop skip
996 "cbz x27, 102f\n"
997 "tbz x27, #3, 97f\n"
998 "ldr d31, [x26], #0x8\n"
999 "ldr d28, [x25], #0x8\n"
1000 "ldr d25, [x24], #0x8\n"
1001 "ldr d22, [x23], #0x8\n"
1002 "ldr d19, [x22], #0x8\n"
1003 "ldr d16, [x20], #0x8\n"
1004 "tbz x27, #2, 95f\n"
1005 "ld1 { v31.s }[2], [x26], #0x4\n"
1006 "ld1 { v28.s }[2], [x25], #0x4\n"
1007 "ld1 { v25.s }[2], [x24], #0x4\n"
1008 "ld1 { v22.s }[2], [x23], #0x4\n"
1009 "ld1 { v19.s }[2], [x22], #0x4\n"
1010 "ld1 { v16.s }[2], [x20], #0x4\n"
1011 "tbz x27, #1, 94f\n"
1012 "ld1 { v31.h }[6], [x26], #0x2\n"
1013 "ld1 { v28.h }[6], [x25], #0x2\n"
1014 "ld1 { v25.h }[6], [x24], #0x2\n"
1015 "ld1 { v22.h }[6], [x23], #0x2\n"
1016 "ld1 { v19.h }[6], [x22], #0x2\n"
1017 "ld1 { v16.h }[6], [x20], #0x2\n"
1018 "tbz x27, #0, 101f\n"
1019 "ld1 { v31.b }[14], [x26]\n"
1020 "ld1 { v28.b }[14], [x25]\n"
1021 "ld1 { v25.b }[14], [x24]\n"
1022 "ld1 { v22.b }[14], [x23]\n"
1023 "ld1 { v19.b }[14], [x22]\n"
1024 "ld1 { v16.b }[14], [x20]\n"
1025 "b 101f\n"
1026 "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
1027 "tbz x27, #0, 101f\n"
1028 "ld1 { v31.b }[12], [x26]\n"
1029 "ld1 { v28.b }[12], [x25]\n"
1030 "ld1 { v25.b }[12], [x24]\n"
1031 "ld1 { v22.b }[12], [x23]\n"
1032 "ld1 { v19.b }[12], [x22]\n"
1033 "ld1 { v16.b }[12], [x20]\n"
1034 "b 101f\n"
1035 "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
1036 "tbz x27, #1, 96f\n"
1037 "ld1 { v31.h }[4], [x26], #0x2\n"
1038 "ld1 { v28.h }[4], [x25], #0x2\n"
1039 "ld1 { v25.h }[4], [x24], #0x2\n"
1040 "ld1 { v22.h }[4], [x23], #0x2\n"
1041 "ld1 { v19.h }[4], [x22], #0x2\n"
1042 "ld1 { v16.h }[4], [x20], #0x2\n"
1043 "tbz x27, #0, 101f\n"
1044 "ld1 { v31.b }[10], [x26]\n"
1045 "ld1 { v28.b }[10], [x25]\n"
1046 "ld1 { v25.b }[10], [x24]\n"
1047 "ld1 { v22.b }[10], [x23]\n"
1048 "ld1 { v19.b }[10], [x22]\n"
1049 "ld1 { v16.b }[10], [x20]\n"
1050 "b 101f\n"
1051 "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
1052 "tbz x27, #0, 101f\n"
1053 "ld1 { v31.b }[8], [x26]\n"
1054 "ld1 { v28.b }[8], [x25]\n"
1055 "ld1 { v25.b }[8], [x24]\n"
1056 "ld1 { v22.b }[8], [x23]\n"
1057 "ld1 { v19.b }[8], [x22]\n"
1058 "ld1 { v16.b }[8], [x20]\n"
1059 "b 101f\n"
1060 "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
1061 "tbz x27, #2, 99f\n"
1062 "ldr s31, [x26], #0x4\n"
1063 "ldr s28, [x25], #0x4\n"
1064 "ldr s25, [x24], #0x4\n"
1065 "ldr s22, [x23], #0x4\n"
1066 "ldr s19, [x22], #0x4\n"
1067 "ldr s16, [x20], #0x4\n"
1068 "tbz x27, #1, 98f\n"
1069 "ld1 { v31.h }[2], [x26], #0x2\n"
1070 "ld1 { v28.h }[2], [x25], #0x2\n"
1071 "ld1 { v25.h }[2], [x24], #0x2\n"
1072 "ld1 { v22.h }[2], [x23], #0x2\n"
1073 "ld1 { v19.h }[2], [x22], #0x2\n"
1074 "ld1 { v16.h }[2], [x20], #0x2\n"
1075 "tbz x27, #0, 101f\n"
1076 "ld1 { v31.b }[6], [x26]\n"
1077 "ld1 { v28.b }[6], [x25]\n"
1078 "ld1 { v25.b }[6], [x24]\n"
1079 "ld1 { v22.b }[6], [x23]\n"
1080 "ld1 { v19.b }[6], [x22]\n"
1081 "ld1 { v16.b }[6], [x20]\n"
1082 "b 101f\n"
1083 "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
1084 "tbz x27, #0, 101f\n"
1085 "ld1 { v31.b }[4], [x26]\n"
1086 "ld1 { v28.b }[4], [x25]\n"
1087 "ld1 { v25.b }[4], [x24]\n"
1088 "ld1 { v22.b }[4], [x23]\n"
1089 "ld1 { v19.b }[4], [x22]\n"
1090 "ld1 { v16.b }[4], [x20]\n"
1091 "b 101f\n"
1092 "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
1093 "tbz x27, #1, 100f\n"
1094 "ldr h31, [x26], #0x2\n"
1095 "ldr h28, [x25], #0x2\n"
1096 "ldr h25, [x24], #0x2\n"
1097 "ldr h22, [x23], #0x2\n"
1098 "ldr h19, [x22], #0x2\n"
1099 "ldr h16, [x20], #0x2\n"
1100 "tbz x27, #0, 101f\n"
1101 "ld1 { v31.b }[2], [x26]\n"
1102 "ld1 { v28.b }[2], [x25]\n"
1103 "ld1 { v25.b }[2], [x24]\n"
1104 "ld1 { v22.b }[2], [x23]\n"
1105 "ld1 { v19.b }[2], [x22]\n"
1106 "ld1 { v16.b }[2], [x20]\n"
1107 "b 101f\n"
1108 "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
1109 "ldr b31, [x26, #0x0]\n"
1110 "ldr b28, [x25, #0x0]\n"
1111 "ldr b25, [x24, #0x0]\n"
1112 "ldr b22, [x23, #0x0]\n"
1113 "ldr b19, [x22, #0x0]\n"
1114 "ldr b16, [x20, #0x0]\n"
1115 "101:" // Height 6: Multiply loop: Ragged operand read: Done
1116 "sadalp v1.8h, v31.16b\n"
1117 "sadalp v30.8h, v28.16b\n"
1118 "sadalp v27.8h, v25.16b\n"
1119 "sadalp v24.8h, v22.16b\n"
1120 "sadalp v21.8h, v19.16b\n"
1121 "sadalp v18.8h, v16.16b\n"
1122 "102:" // Height 6: Multiply loop: No odd multiplies
1123 "add x28, x28, #0x1\n"
1124 "cmp x28, x21\n"
1125 "bne 87b\n"
1126 "sadalp v0.4s, v1.8h\n"
1127 "sadalp v29.4s, v30.8h\n"
1128 "addp v0.4s, v0.4s, v29.4s\n"
1129 "sadalp v26.4s, v27.8h\n"
1130 "sadalp v23.4s, v24.8h\n"
1131 "addp v29.4s, v26.4s, v23.4s\n"
1132 "sadalp v20.4s, v21.8h\n"
1133 "sadalp v17.4s, v18.8h\n"
1134 "addp v0.4s, v0.4s, v29.4s\n"
1135 "subs %x[M], %x[M], #0x6\n"
1136 "addp v20.4s, v20.4s, v17.4s\n"
1137 "mul v0.4s, v0.4s, v2.4s\n"
1138 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
1139 "addp v20.4s, v20.4s, v20.4s\n"
1140 "mul v20.4s, v20.4s, v2.4s\n"
1141 "str d20, [%x[out_ptr]], #0x8\n"
1142 "beq 104f\n"
1143 "tbz %x[flags], #3, 103f\n"
1144 "add %x[input_offset], %x[input_offset], #0x6\n"
1145 "b 1b\n"
1146 "103:" // Update direct input
1147 "mov x19, #0x6\n"
1148 "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
1149 "b 1b\n"
1150 "104:" // Exit
1151
1152 : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
1153 : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
1154 : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1155 );
1156}
1157
1158} // namespace arm_gemm
1159
1160#endif // __aarch64__