blob: 7345793f938b19bce5d296854538ce2dd1a6cc2b [file] [log] [blame]
/*
 * Copyright (c) 2019-2020, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#ifdef __aarch64__
26
27#include "arm_gemm.hpp"
28#include "quantized.hpp"
29#include "utils.hpp"
30
31#include <cassert>
32
33namespace arm_gemm {
34
35template<>
36void row_sums_indirect(
37 unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
38 size_t M, int32_t *out_ptr, const Requantize32 *qp
39)
40{
41 struct KernelArgs {
42 unsigned int num_strings;
43 const unsigned int *string_lengths;
44 unsigned int input_initial_col;
45 } ka;
46
47 unsigned long flags=0;
48 void *input_ptr;
49 size_t input_offset;
50
51 if (A_arg.is_indirect) {
52 input_ptr=(void *)(A_arg.indirect.ptr);
53 input_offset=A_arg.indirect.start_row;
54 ka.input_initial_col=A_arg.indirect.start_col;
55 flags |= 0x8;
56 } else {
57 assert(num_strings==1);
58 input_ptr=(void *)(A_arg.direct.base);
59 input_offset=A_arg.direct.stride;
60 }
61
62 ka.num_strings = num_strings;
63 ka.string_lengths = string_lengths;
64
65 __asm__ __volatile__(
Michael Tyler7d9a6262023-02-01 16:37:07 +000066 "add x20, %x[qp], %[b_offset]\n"
67 "ld1r { v2.4s }, [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000068 "neg v2.4s, v2.4s\n"
69 "1:" // Row loop
70 "cmp %x[M], #0x6\n"
71 "bge 86f\n"
72 "cmp %x[M], #0x4\n"
73 "bgt 69f\n"
74 "beq 52f\n"
75 "cmp %x[M], #0x2\n"
76 "bgt 35f\n"
77 "beq 18f\n"
78 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000079 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +000080 "mov x10, #0x0\n"
81 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000082 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000083 "2:" // Height 1: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +000084 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
85 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000086 "tbz %x[flags], #3, 3f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +000087 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
88 "add x20, x20, %x[input_offset], LSL #3\n"
89 "ldr x27, [x20, #0x0]\n"
90 "cbnz x9, 4f\n"
91 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
92 "add x27, x27, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000093 "b 4f\n"
94 "3:" // Height 1: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +000095 "mov x27, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000096 "4:" // Height 1: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +000097 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000098 "blt 8f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +000099 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000100 "blt 7f\n"
101 "5:" // Height 1: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000102 "ldr q31, [x27, #0x0]\n"
103 "cmp x10, #0x7e\n"
104 "add x27, x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000105 "blt 6f\n"
106 "sadalp v0.4s, v1.8h\n"
107 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000108 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000109 "6:" // Height 1: Multiply loop: unique 1: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000110 "sub x28, x28, #0x10\n"
111 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000112 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000113 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000114 "bge 5b\n"
115 "7:" // Height 1: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000116 "ldr q31, [x27, #0x0]\n"
117 "sub x28, x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000118 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000119 "add x27, x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000120 "8:" // Height 1: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000121 "cbz x28, 17f\n"
122 "tbz x28, #3, 12f\n"
123 "ldr d31, [x27], #0x8\n"
124 "tbz x28, #2, 10f\n"
125 "ld1 { v31.s }[2], [x27], #0x4\n"
126 "tbz x28, #1, 9f\n"
127 "ld1 { v31.h }[6], [x27], #0x2\n"
128 "tbz x28, #0, 16f\n"
129 "ld1 { v31.b }[14], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000130 "b 16f\n"
131 "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +0000132 "tbz x28, #0, 16f\n"
133 "ld1 { v31.b }[12], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000134 "b 16f\n"
135 "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000136 "tbz x28, #1, 11f\n"
137 "ld1 { v31.h }[4], [x27], #0x2\n"
138 "tbz x28, #0, 16f\n"
139 "ld1 { v31.b }[10], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000140 "b 16f\n"
141 "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000142 "tbz x28, #0, 16f\n"
143 "ld1 { v31.b }[8], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000144 "b 16f\n"
145 "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000146 "tbz x28, #2, 14f\n"
147 "ldr s31, [x27], #0x4\n"
148 "tbz x28, #1, 13f\n"
149 "ld1 { v31.h }[2], [x27], #0x2\n"
150 "tbz x28, #0, 16f\n"
151 "ld1 { v31.b }[6], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000152 "b 16f\n"
153 "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +0000154 "tbz x28, #0, 16f\n"
155 "ld1 { v31.b }[4], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000156 "b 16f\n"
157 "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000158 "tbz x28, #1, 15f\n"
159 "ldr h31, [x27], #0x2\n"
160 "tbz x28, #0, 16f\n"
161 "ld1 { v31.b }[2], [x27]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000162 "b 16f\n"
163 "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000164 "ldr b31, [x27, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000165 "16:" // Height 1: Multiply loop: Ragged operand read: Done
166 "sadalp v1.8h, v31.16b\n"
167 "17:" // Height 1: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +0000168 "add x9, x9, #0x1\n"
169 "cmp x9, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000170 "bne 2b\n"
171 "sadalp v0.4s, v1.8h\n"
172 "addp v0.4s, v0.4s, v0.4s\n"
173 "addp v0.4s, v0.4s, v0.4s\n"
174 "mul v0.4s, v0.4s, v2.4s\n"
175 "str s0, [%x[out_ptr]], #0x4\n"
176 "b 104f\n"
177 "18:" // Height 2
178 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000179 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000180 "mov x10, #0x0\n"
181 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000182 "movi v30.8h, #0x0\n"
183 "movi v29.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000184 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000185 "19:" // Height 2: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +0000186 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
187 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000188 "tbz %x[flags], #3, 20f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000189 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
190 "add x20, x20, %x[input_offset], LSL #3\n"
191 "ldr x27, [x20, #0x0]\n"
192 "ldr x26, [x20, #0x8]\n"
193 "cbnz x9, 21f\n"
194 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
195 "add x27, x27, x20\n"
196 "add x26, x26, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000197 "b 21f\n"
198 "20:" // Height 2: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +0000199 "mov x27, %x[input_ptr]\n"
200 "add x26, x27, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000201 "21:" // Height 2: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +0000202 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000203 "blt 25f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000204 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000205 "blt 24f\n"
206 "22:" // Height 2: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000207 "ldr q31, [x27, #0x0]\n"
208 "ldr q28, [x26, #0x0]\n"
209 "cmp x10, #0x7e\n"
210 "add x27, x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000211 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000212 "blt 23f\n"
213 "sadalp v0.4s, v1.8h\n"
214 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000215 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000216 "sadalp v29.4s, v30.8h\n"
217 "movi v30.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000218 "23:" // Height 2: Multiply loop: unique 2: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000219 "sub x28, x28, #0x10\n"
220 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000221 "sadalp v1.8h, v31.16b\n"
222 "sadalp v30.8h, v28.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000223 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000224 "bge 22b\n"
225 "24:" // Height 2: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000226 "ldr q31, [x27, #0x0]\n"
227 "ldr q28, [x26, #0x0]\n"
228 "sub x28, x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000229 "sadalp v1.8h, v31.16b\n"
230 "sadalp v30.8h, v28.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000231 "add x27, x27, #0x10\n"
232 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000233 "25:" // Height 2: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000234 "cbz x28, 34f\n"
235 "tbz x28, #3, 29f\n"
236 "ldr d31, [x27], #0x8\n"
237 "ldr d28, [x26], #0x8\n"
238 "tbz x28, #2, 27f\n"
239 "ld1 { v31.s }[2], [x27], #0x4\n"
240 "ld1 { v28.s }[2], [x26], #0x4\n"
241 "tbz x28, #1, 26f\n"
242 "ld1 { v31.h }[6], [x27], #0x2\n"
243 "ld1 { v28.h }[6], [x26], #0x2\n"
244 "tbz x28, #0, 33f\n"
245 "ld1 { v31.b }[14], [x27]\n"
246 "ld1 { v28.b }[14], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000247 "b 33f\n"
248 "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +0000249 "tbz x28, #0, 33f\n"
250 "ld1 { v31.b }[12], [x27]\n"
251 "ld1 { v28.b }[12], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000252 "b 33f\n"
253 "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000254 "tbz x28, #1, 28f\n"
255 "ld1 { v31.h }[4], [x27], #0x2\n"
256 "ld1 { v28.h }[4], [x26], #0x2\n"
257 "tbz x28, #0, 33f\n"
258 "ld1 { v31.b }[10], [x27]\n"
259 "ld1 { v28.b }[10], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000260 "b 33f\n"
261 "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000262 "tbz x28, #0, 33f\n"
263 "ld1 { v31.b }[8], [x27]\n"
264 "ld1 { v28.b }[8], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000265 "b 33f\n"
266 "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000267 "tbz x28, #2, 31f\n"
268 "ldr s31, [x27], #0x4\n"
269 "ldr s28, [x26], #0x4\n"
270 "tbz x28, #1, 30f\n"
271 "ld1 { v31.h }[2], [x27], #0x2\n"
272 "ld1 { v28.h }[2], [x26], #0x2\n"
273 "tbz x28, #0, 33f\n"
274 "ld1 { v31.b }[6], [x27]\n"
275 "ld1 { v28.b }[6], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000276 "b 33f\n"
277 "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +0000278 "tbz x28, #0, 33f\n"
279 "ld1 { v31.b }[4], [x27]\n"
280 "ld1 { v28.b }[4], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000281 "b 33f\n"
282 "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000283 "tbz x28, #1, 32f\n"
284 "ldr h31, [x27], #0x2\n"
285 "ldr h28, [x26], #0x2\n"
286 "tbz x28, #0, 33f\n"
287 "ld1 { v31.b }[2], [x27]\n"
288 "ld1 { v28.b }[2], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000289 "b 33f\n"
290 "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000291 "ldr b31, [x27, #0x0]\n"
292 "ldr b28, [x26, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000293 "33:" // Height 2: Multiply loop: Ragged operand read: Done
294 "sadalp v1.8h, v31.16b\n"
295 "sadalp v30.8h, v28.16b\n"
296 "34:" // Height 2: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +0000297 "add x9, x9, #0x1\n"
298 "cmp x9, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000299 "bne 19b\n"
300 "sadalp v0.4s, v1.8h\n"
301 "sadalp v29.4s, v30.8h\n"
302 "addp v0.4s, v0.4s, v29.4s\n"
303 "addp v0.4s, v0.4s, v0.4s\n"
304 "mul v0.4s, v0.4s, v2.4s\n"
305 "str d0, [%x[out_ptr]], #0x8\n"
306 "b 104f\n"
307 "35:" // Height 3
308 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000309 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000310 "mov x10, #0x0\n"
311 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000312 "movi v30.8h, #0x0\n"
313 "movi v29.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000314 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000315 "movi v27.8h, #0x0\n"
316 "movi v26.4s, #0x0\n"
317 "36:" // Height 3: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +0000318 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
319 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000320 "tbz %x[flags], #3, 37f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000321 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
322 "add x20, x20, %x[input_offset], LSL #3\n"
323 "ldr x27, [x20, #0x0]\n"
324 "ldr x26, [x20, #0x8]\n"
325 "ldr x25, [x20, #0x10]\n"
326 "cbnz x9, 38f\n"
327 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
328 "add x27, x27, x20\n"
329 "add x26, x26, x20\n"
330 "add x25, x25, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000331 "b 38f\n"
332 "37:" // Height 3: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +0000333 "mov x27, %x[input_ptr]\n"
334 "add x26, x27, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000335 "add x25, x26, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000336 "38:" // Height 3: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +0000337 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000338 "blt 42f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000339 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000340 "blt 41f\n"
341 "39:" // Height 3: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000342 "ldr q31, [x27, #0x0]\n"
343 "ldr q28, [x26, #0x0]\n"
344 "cmp x10, #0x7e\n"
345 "add x27, x27, #0x10\n"
346 "ldr q25, [x25, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000347 "add x26, x26, #0x10\n"
348 "add x25, x25, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000349 "blt 40f\n"
350 "sadalp v0.4s, v1.8h\n"
351 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000352 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000353 "sadalp v29.4s, v30.8h\n"
354 "movi v30.8h, #0x0\n"
355 "sadalp v26.4s, v27.8h\n"
356 "movi v27.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000357 "40:" // Height 3: Multiply loop: unique 3: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000358 "sub x28, x28, #0x10\n"
359 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000360 "sadalp v1.8h, v31.16b\n"
361 "sadalp v30.8h, v28.16b\n"
362 "sadalp v27.8h, v25.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000363 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000364 "bge 39b\n"
365 "41:" // Height 3: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000366 "ldr q31, [x27, #0x0]\n"
367 "ldr q28, [x26, #0x0]\n"
368 "sub x28, x28, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000369 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000370 "ldr q25, [x25, #0x0]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000371 "sadalp v30.8h, v28.16b\n"
372 "sadalp v27.8h, v25.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000373 "add x27, x27, #0x10\n"
374 "add x26, x26, #0x10\n"
375 "add x25, x25, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000376 "42:" // Height 3: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000377 "cbz x28, 51f\n"
378 "tbz x28, #3, 46f\n"
379 "ldr d31, [x27], #0x8\n"
380 "ldr d28, [x26], #0x8\n"
381 "ldr d25, [x25], #0x8\n"
382 "tbz x28, #2, 44f\n"
383 "ld1 { v31.s }[2], [x27], #0x4\n"
384 "ld1 { v28.s }[2], [x26], #0x4\n"
385 "ld1 { v25.s }[2], [x25], #0x4\n"
386 "tbz x28, #1, 43f\n"
387 "ld1 { v31.h }[6], [x27], #0x2\n"
388 "ld1 { v28.h }[6], [x26], #0x2\n"
389 "ld1 { v25.h }[6], [x25], #0x2\n"
390 "tbz x28, #0, 50f\n"
391 "ld1 { v31.b }[14], [x27]\n"
392 "ld1 { v28.b }[14], [x26]\n"
393 "ld1 { v25.b }[14], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000394 "b 50f\n"
395 "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +0000396 "tbz x28, #0, 50f\n"
397 "ld1 { v31.b }[12], [x27]\n"
398 "ld1 { v28.b }[12], [x26]\n"
399 "ld1 { v25.b }[12], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000400 "b 50f\n"
401 "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000402 "tbz x28, #1, 45f\n"
403 "ld1 { v31.h }[4], [x27], #0x2\n"
404 "ld1 { v28.h }[4], [x26], #0x2\n"
405 "ld1 { v25.h }[4], [x25], #0x2\n"
406 "tbz x28, #0, 50f\n"
407 "ld1 { v31.b }[10], [x27]\n"
408 "ld1 { v28.b }[10], [x26]\n"
409 "ld1 { v25.b }[10], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000410 "b 50f\n"
411 "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000412 "tbz x28, #0, 50f\n"
413 "ld1 { v31.b }[8], [x27]\n"
414 "ld1 { v28.b }[8], [x26]\n"
415 "ld1 { v25.b }[8], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000416 "b 50f\n"
417 "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000418 "tbz x28, #2, 48f\n"
419 "ldr s31, [x27], #0x4\n"
420 "ldr s28, [x26], #0x4\n"
421 "ldr s25, [x25], #0x4\n"
422 "tbz x28, #1, 47f\n"
423 "ld1 { v31.h }[2], [x27], #0x2\n"
424 "ld1 { v28.h }[2], [x26], #0x2\n"
425 "ld1 { v25.h }[2], [x25], #0x2\n"
426 "tbz x28, #0, 50f\n"
427 "ld1 { v31.b }[6], [x27]\n"
428 "ld1 { v28.b }[6], [x26]\n"
429 "ld1 { v25.b }[6], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000430 "b 50f\n"
431 "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +0000432 "tbz x28, #0, 50f\n"
433 "ld1 { v31.b }[4], [x27]\n"
434 "ld1 { v28.b }[4], [x26]\n"
435 "ld1 { v25.b }[4], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000436 "b 50f\n"
437 "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000438 "tbz x28, #1, 49f\n"
439 "ldr h31, [x27], #0x2\n"
440 "ldr h28, [x26], #0x2\n"
441 "ldr h25, [x25], #0x2\n"
442 "tbz x28, #0, 50f\n"
443 "ld1 { v31.b }[2], [x27]\n"
444 "ld1 { v28.b }[2], [x26]\n"
445 "ld1 { v25.b }[2], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000446 "b 50f\n"
447 "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000448 "ldr b31, [x27, #0x0]\n"
449 "ldr b28, [x26, #0x0]\n"
450 "ldr b25, [x25, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000451 "50:" // Height 3: Multiply loop: Ragged operand read: Done
452 "sadalp v1.8h, v31.16b\n"
453 "sadalp v30.8h, v28.16b\n"
454 "sadalp v27.8h, v25.16b\n"
455 "51:" // Height 3: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +0000456 "add x9, x9, #0x1\n"
457 "cmp x9, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000458 "bne 36b\n"
459 "sadalp v0.4s, v1.8h\n"
460 "sadalp v29.4s, v30.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000461 "sadalp v26.4s, v27.8h\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000462 "addp v0.4s, v0.4s, v29.4s\n"
463 "addp v26.4s, v26.4s, v26.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000464 "addp v0.4s, v0.4s, v0.4s\n"
465 "addp v26.4s, v26.4s, v26.4s\n"
466 "mul v0.4s, v0.4s, v2.4s\n"
467 "str d0, [%x[out_ptr]], #0x8\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000468 "mul v26.4s, v26.4s, v2.4s\n"
469 "str s26, [%x[out_ptr]], #0x4\n"
470 "b 104f\n"
471 "52:" // Height 4
472 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000473 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000474 "mov x10, #0x0\n"
475 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000476 "movi v30.8h, #0x0\n"
477 "movi v29.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000478 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000479 "movi v27.8h, #0x0\n"
480 "movi v26.4s, #0x0\n"
481 "movi v24.8h, #0x0\n"
482 "movi v23.4s, #0x0\n"
483 "53:" // Height 4: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +0000484 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
485 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000486 "tbz %x[flags], #3, 54f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000487 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
488 "add x20, x20, %x[input_offset], LSL #3\n"
489 "ldr x27, [x20, #0x0]\n"
490 "ldr x26, [x20, #0x8]\n"
491 "ldr x25, [x20, #0x10]\n"
492 "ldr x24, [x20, #0x18]\n"
493 "cbnz x9, 55f\n"
494 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
495 "add x27, x27, x20\n"
496 "add x26, x26, x20\n"
497 "add x25, x25, x20\n"
498 "add x24, x24, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000499 "b 55f\n"
500 "54:" // Height 4: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +0000501 "mov x27, %x[input_ptr]\n"
502 "add x26, x27, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000503 "add x25, x26, %x[input_offset]\n"
504 "add x24, x25, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000505 "55:" // Height 4: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +0000506 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000507 "blt 59f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000508 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000509 "blt 58f\n"
510 "56:" // Height 4: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000511 "ldr q31, [x27, #0x0]\n"
512 "ldr q28, [x26, #0x0]\n"
513 "cmp x10, #0x7e\n"
514 "add x27, x27, #0x10\n"
515 "ldr q25, [x25, #0x0]\n"
516 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000517 "add x26, x26, #0x10\n"
518 "add x25, x25, #0x10\n"
519 "add x24, x24, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000520 "blt 57f\n"
521 "sadalp v0.4s, v1.8h\n"
522 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000523 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000524 "sadalp v29.4s, v30.8h\n"
525 "movi v30.8h, #0x0\n"
526 "sadalp v26.4s, v27.8h\n"
527 "movi v27.8h, #0x0\n"
528 "sadalp v23.4s, v24.8h\n"
529 "movi v24.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000530 "57:" // Height 4: Multiply loop: unique 4: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000531 "sub x28, x28, #0x10\n"
532 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000533 "sadalp v1.8h, v31.16b\n"
534 "sadalp v30.8h, v28.16b\n"
535 "sadalp v27.8h, v25.16b\n"
536 "sadalp v24.8h, v22.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000537 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000538 "bge 56b\n"
539 "58:" // Height 4: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000540 "ldr q31, [x27, #0x0]\n"
541 "ldr q28, [x26, #0x0]\n"
542 "sub x28, x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000543 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000544 "ldr q25, [x25, #0x0]\n"
545 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000546 "sadalp v30.8h, v28.16b\n"
547 "sadalp v27.8h, v25.16b\n"
548 "sadalp v24.8h, v22.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000549 "add x27, x27, #0x10\n"
550 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000551 "add x25, x25, #0x10\n"
552 "add x24, x24, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000553 "59:" // Height 4: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000554 "cbz x28, 68f\n"
555 "tbz x28, #3, 63f\n"
556 "ldr d31, [x27], #0x8\n"
557 "ldr d28, [x26], #0x8\n"
558 "ldr d25, [x25], #0x8\n"
559 "ldr d22, [x24], #0x8\n"
560 "tbz x28, #2, 61f\n"
561 "ld1 { v31.s }[2], [x27], #0x4\n"
562 "ld1 { v28.s }[2], [x26], #0x4\n"
563 "ld1 { v25.s }[2], [x25], #0x4\n"
564 "ld1 { v22.s }[2], [x24], #0x4\n"
565 "tbz x28, #1, 60f\n"
566 "ld1 { v31.h }[6], [x27], #0x2\n"
567 "ld1 { v28.h }[6], [x26], #0x2\n"
568 "ld1 { v25.h }[6], [x25], #0x2\n"
569 "ld1 { v22.h }[6], [x24], #0x2\n"
570 "tbz x28, #0, 67f\n"
571 "ld1 { v31.b }[14], [x27]\n"
572 "ld1 { v28.b }[14], [x26]\n"
573 "ld1 { v25.b }[14], [x25]\n"
574 "ld1 { v22.b }[14], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000575 "b 67f\n"
576 "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +0000577 "tbz x28, #0, 67f\n"
578 "ld1 { v31.b }[12], [x27]\n"
579 "ld1 { v28.b }[12], [x26]\n"
580 "ld1 { v25.b }[12], [x25]\n"
581 "ld1 { v22.b }[12], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000582 "b 67f\n"
583 "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000584 "tbz x28, #1, 62f\n"
585 "ld1 { v31.h }[4], [x27], #0x2\n"
586 "ld1 { v28.h }[4], [x26], #0x2\n"
587 "ld1 { v25.h }[4], [x25], #0x2\n"
588 "ld1 { v22.h }[4], [x24], #0x2\n"
589 "tbz x28, #0, 67f\n"
590 "ld1 { v31.b }[10], [x27]\n"
591 "ld1 { v28.b }[10], [x26]\n"
592 "ld1 { v25.b }[10], [x25]\n"
593 "ld1 { v22.b }[10], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000594 "b 67f\n"
595 "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000596 "tbz x28, #0, 67f\n"
597 "ld1 { v31.b }[8], [x27]\n"
598 "ld1 { v28.b }[8], [x26]\n"
599 "ld1 { v25.b }[8], [x25]\n"
600 "ld1 { v22.b }[8], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000601 "b 67f\n"
602 "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000603 "tbz x28, #2, 65f\n"
604 "ldr s31, [x27], #0x4\n"
605 "ldr s28, [x26], #0x4\n"
606 "ldr s25, [x25], #0x4\n"
607 "ldr s22, [x24], #0x4\n"
608 "tbz x28, #1, 64f\n"
609 "ld1 { v31.h }[2], [x27], #0x2\n"
610 "ld1 { v28.h }[2], [x26], #0x2\n"
611 "ld1 { v25.h }[2], [x25], #0x2\n"
612 "ld1 { v22.h }[2], [x24], #0x2\n"
613 "tbz x28, #0, 67f\n"
614 "ld1 { v31.b }[6], [x27]\n"
615 "ld1 { v28.b }[6], [x26]\n"
616 "ld1 { v25.b }[6], [x25]\n"
617 "ld1 { v22.b }[6], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000618 "b 67f\n"
619 "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +0000620 "tbz x28, #0, 67f\n"
621 "ld1 { v31.b }[4], [x27]\n"
622 "ld1 { v28.b }[4], [x26]\n"
623 "ld1 { v25.b }[4], [x25]\n"
624 "ld1 { v22.b }[4], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000625 "b 67f\n"
626 "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000627 "tbz x28, #1, 66f\n"
628 "ldr h31, [x27], #0x2\n"
629 "ldr h28, [x26], #0x2\n"
630 "ldr h25, [x25], #0x2\n"
631 "ldr h22, [x24], #0x2\n"
632 "tbz x28, #0, 67f\n"
633 "ld1 { v31.b }[2], [x27]\n"
634 "ld1 { v28.b }[2], [x26]\n"
635 "ld1 { v25.b }[2], [x25]\n"
636 "ld1 { v22.b }[2], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000637 "b 67f\n"
638 "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000639 "ldr b31, [x27, #0x0]\n"
640 "ldr b28, [x26, #0x0]\n"
641 "ldr b25, [x25, #0x0]\n"
642 "ldr b22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000643 "67:" // Height 4: Multiply loop: Ragged operand read: Done
644 "sadalp v1.8h, v31.16b\n"
645 "sadalp v30.8h, v28.16b\n"
646 "sadalp v27.8h, v25.16b\n"
647 "sadalp v24.8h, v22.16b\n"
648 "68:" // Height 4: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +0000649 "add x9, x9, #0x1\n"
650 "cmp x9, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000651 "bne 53b\n"
652 "sadalp v0.4s, v1.8h\n"
653 "sadalp v29.4s, v30.8h\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000654 "sadalp v26.4s, v27.8h\n"
655 "sadalp v23.4s, v24.8h\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000656 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000657 "addp v29.4s, v26.4s, v23.4s\n"
658 "addp v0.4s, v0.4s, v29.4s\n"
659 "mul v0.4s, v0.4s, v2.4s\n"
660 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
661 "b 104f\n"
662 "69:" // Height 5
663 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000664 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000665 "mov x10, #0x0\n"
666 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000667 "movi v30.8h, #0x0\n"
668 "movi v29.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000669 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000670 "movi v27.8h, #0x0\n"
671 "movi v26.4s, #0x0\n"
672 "movi v24.8h, #0x0\n"
673 "movi v23.4s, #0x0\n"
674 "movi v21.8h, #0x0\n"
675 "movi v20.4s, #0x0\n"
676 "70:" // Height 5: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +0000677 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
678 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000679 "tbz %x[flags], #3, 71f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000680 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
681 "add x20, x20, %x[input_offset], LSL #3\n"
682 "ldr x27, [x20, #0x0]\n"
683 "ldr x26, [x20, #0x8]\n"
684 "ldr x25, [x20, #0x10]\n"
685 "ldr x24, [x20, #0x18]\n"
686 "ldr x23, [x20, #0x20]\n"
687 "cbnz x9, 72f\n"
688 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
689 "add x27, x27, x20\n"
690 "add x26, x26, x20\n"
691 "add x25, x25, x20\n"
692 "add x24, x24, x20\n"
693 "add x23, x23, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000694 "b 72f\n"
695 "71:" // Height 5: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +0000696 "mov x27, %x[input_ptr]\n"
697 "add x26, x27, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000698 "add x25, x26, %x[input_offset]\n"
699 "add x24, x25, %x[input_offset]\n"
700 "add x23, x24, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000701 "72:" // Height 5: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +0000702 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000703 "blt 76f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000704 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000705 "blt 75f\n"
706 "73:" // Height 5: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000707 "ldr q31, [x27, #0x0]\n"
708 "ldr q28, [x26, #0x0]\n"
709 "cmp x10, #0x7e\n"
710 "add x27, x27, #0x10\n"
711 "ldr q25, [x25, #0x0]\n"
712 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000713 "add x26, x26, #0x10\n"
714 "add x25, x25, #0x10\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000715 "ldr q19, [x23, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000716 "add x24, x24, #0x10\n"
717 "add x23, x23, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000718 "blt 74f\n"
719 "sadalp v0.4s, v1.8h\n"
720 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000721 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000722 "sadalp v29.4s, v30.8h\n"
723 "movi v30.8h, #0x0\n"
724 "sadalp v26.4s, v27.8h\n"
725 "movi v27.8h, #0x0\n"
726 "sadalp v23.4s, v24.8h\n"
727 "movi v24.8h, #0x0\n"
728 "sadalp v20.4s, v21.8h\n"
729 "movi v21.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000730 "74:" // Height 5: Multiply loop: unique 5: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000731 "sub x28, x28, #0x10\n"
732 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000733 "sadalp v1.8h, v31.16b\n"
734 "sadalp v30.8h, v28.16b\n"
735 "sadalp v27.8h, v25.16b\n"
736 "sadalp v24.8h, v22.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000737 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000738 "sadalp v21.8h, v19.16b\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000739 "bge 73b\n"
740 "75:" // Height 5: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000741 "ldr q31, [x27, #0x0]\n"
742 "ldr q28, [x26, #0x0]\n"
743 "sub x28, x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000744 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000745 "ldr q25, [x25, #0x0]\n"
746 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000747 "sadalp v30.8h, v28.16b\n"
748 "sadalp v27.8h, v25.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000749 "ldr q19, [x23, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000750 "sadalp v24.8h, v22.16b\n"
751 "sadalp v21.8h, v19.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000752 "add x27, x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000753 "add x26, x26, #0x10\n"
754 "add x25, x25, #0x10\n"
755 "add x24, x24, #0x10\n"
756 "add x23, x23, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000757 "76:" // Height 5: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000758 "cbz x28, 85f\n"
759 "tbz x28, #3, 80f\n"
760 "ldr d31, [x27], #0x8\n"
761 "ldr d28, [x26], #0x8\n"
762 "ldr d25, [x25], #0x8\n"
763 "ldr d22, [x24], #0x8\n"
764 "ldr d19, [x23], #0x8\n"
765 "tbz x28, #2, 78f\n"
766 "ld1 { v31.s }[2], [x27], #0x4\n"
767 "ld1 { v28.s }[2], [x26], #0x4\n"
768 "ld1 { v25.s }[2], [x25], #0x4\n"
769 "ld1 { v22.s }[2], [x24], #0x4\n"
770 "ld1 { v19.s }[2], [x23], #0x4\n"
771 "tbz x28, #1, 77f\n"
772 "ld1 { v31.h }[6], [x27], #0x2\n"
773 "ld1 { v28.h }[6], [x26], #0x2\n"
774 "ld1 { v25.h }[6], [x25], #0x2\n"
775 "ld1 { v22.h }[6], [x24], #0x2\n"
776 "ld1 { v19.h }[6], [x23], #0x2\n"
777 "tbz x28, #0, 84f\n"
778 "ld1 { v31.b }[14], [x27]\n"
779 "ld1 { v28.b }[14], [x26]\n"
780 "ld1 { v25.b }[14], [x25]\n"
781 "ld1 { v22.b }[14], [x24]\n"
782 "ld1 { v19.b }[14], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000783 "b 84f\n"
784 "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +0000785 "tbz x28, #0, 84f\n"
786 "ld1 { v31.b }[12], [x27]\n"
787 "ld1 { v28.b }[12], [x26]\n"
788 "ld1 { v25.b }[12], [x25]\n"
789 "ld1 { v22.b }[12], [x24]\n"
790 "ld1 { v19.b }[12], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000791 "b 84f\n"
792 "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000793 "tbz x28, #1, 79f\n"
794 "ld1 { v31.h }[4], [x27], #0x2\n"
795 "ld1 { v28.h }[4], [x26], #0x2\n"
796 "ld1 { v25.h }[4], [x25], #0x2\n"
797 "ld1 { v22.h }[4], [x24], #0x2\n"
798 "ld1 { v19.h }[4], [x23], #0x2\n"
799 "tbz x28, #0, 84f\n"
800 "ld1 { v31.b }[10], [x27]\n"
801 "ld1 { v28.b }[10], [x26]\n"
802 "ld1 { v25.b }[10], [x25]\n"
803 "ld1 { v22.b }[10], [x24]\n"
804 "ld1 { v19.b }[10], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000805 "b 84f\n"
806 "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +0000807 "tbz x28, #0, 84f\n"
808 "ld1 { v31.b }[8], [x27]\n"
809 "ld1 { v28.b }[8], [x26]\n"
810 "ld1 { v25.b }[8], [x25]\n"
811 "ld1 { v22.b }[8], [x24]\n"
812 "ld1 { v19.b }[8], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000813 "b 84f\n"
814 "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000815 "tbz x28, #2, 82f\n"
816 "ldr s31, [x27], #0x4\n"
817 "ldr s28, [x26], #0x4\n"
818 "ldr s25, [x25], #0x4\n"
819 "ldr s22, [x24], #0x4\n"
820 "ldr s19, [x23], #0x4\n"
821 "tbz x28, #1, 81f\n"
822 "ld1 { v31.h }[2], [x27], #0x2\n"
823 "ld1 { v28.h }[2], [x26], #0x2\n"
824 "ld1 { v25.h }[2], [x25], #0x2\n"
825 "ld1 { v22.h }[2], [x24], #0x2\n"
826 "ld1 { v19.h }[2], [x23], #0x2\n"
827 "tbz x28, #0, 84f\n"
828 "ld1 { v31.b }[6], [x27]\n"
829 "ld1 { v28.b }[6], [x26]\n"
830 "ld1 { v25.b }[6], [x25]\n"
831 "ld1 { v22.b }[6], [x24]\n"
832 "ld1 { v19.b }[6], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000833 "b 84f\n"
834 "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +0000835 "tbz x28, #0, 84f\n"
836 "ld1 { v31.b }[4], [x27]\n"
837 "ld1 { v28.b }[4], [x26]\n"
838 "ld1 { v25.b }[4], [x25]\n"
839 "ld1 { v22.b }[4], [x24]\n"
840 "ld1 { v19.b }[4], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000841 "b 84f\n"
842 "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000843 "tbz x28, #1, 83f\n"
844 "ldr h31, [x27], #0x2\n"
845 "ldr h28, [x26], #0x2\n"
846 "ldr h25, [x25], #0x2\n"
847 "ldr h22, [x24], #0x2\n"
848 "ldr h19, [x23], #0x2\n"
849 "tbz x28, #0, 84f\n"
850 "ld1 { v31.b }[2], [x27]\n"
851 "ld1 { v28.b }[2], [x26]\n"
852 "ld1 { v25.b }[2], [x25]\n"
853 "ld1 { v22.b }[2], [x24]\n"
854 "ld1 { v19.b }[2], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000855 "b 84f\n"
856 "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +0000857 "ldr b31, [x27, #0x0]\n"
858 "ldr b28, [x26, #0x0]\n"
859 "ldr b25, [x25, #0x0]\n"
860 "ldr b22, [x24, #0x0]\n"
861 "ldr b19, [x23, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000862 "84:" // Height 5: Multiply loop: Ragged operand read: Done
863 "sadalp v1.8h, v31.16b\n"
864 "sadalp v30.8h, v28.16b\n"
865 "sadalp v27.8h, v25.16b\n"
866 "sadalp v24.8h, v22.16b\n"
867 "sadalp v21.8h, v19.16b\n"
868 "85:" // Height 5: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +0000869 "add x9, x9, #0x1\n"
870 "cmp x9, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000871 "bne 70b\n"
872 "sadalp v0.4s, v1.8h\n"
873 "sadalp v29.4s, v30.8h\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000874 "sadalp v26.4s, v27.8h\n"
875 "sadalp v23.4s, v24.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000876 "sadalp v20.4s, v21.8h\n"
Michael Tylerba209752022-12-15 12:39:29 +0000877 "addp v0.4s, v0.4s, v29.4s\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000878 "addp v29.4s, v26.4s, v23.4s\n"
879 "addp v20.4s, v20.4s, v20.4s\n"
880 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000881 "addp v20.4s, v20.4s, v20.4s\n"
882 "mul v0.4s, v0.4s, v2.4s\n"
883 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000884 "mul v20.4s, v20.4s, v2.4s\n"
885 "str s20, [%x[out_ptr]], #0x4\n"
886 "b 104f\n"
887 "86:" // Height 6
888 "movi v1.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000889 "movi v0.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000890 "mov x10, #0x0\n"
891 "ldr w22, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000892 "movi v30.8h, #0x0\n"
893 "movi v29.4s, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000894 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000895 "movi v27.8h, #0x0\n"
896 "movi v26.4s, #0x0\n"
897 "movi v24.8h, #0x0\n"
898 "movi v23.4s, #0x0\n"
899 "movi v21.8h, #0x0\n"
900 "movi v20.4s, #0x0\n"
901 "movi v18.8h, #0x0\n"
902 "movi v17.4s, #0x0\n"
903 "87:" // Height 6: String loop
Michael Tyler7d9a6262023-02-01 16:37:07 +0000904 "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
905 "ldr w28, [x20, x9, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000906 "tbz %x[flags], #3, 88f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000907 "ldr x20, [%x[input_ptr], x9, LSL #0x3]\n"
908 "add x20, x20, %x[input_offset], LSL #3\n"
909 "ldr x27, [x20, #0x0]\n"
910 "ldr x26, [x20, #0x8]\n"
911 "ldr x25, [x20, #0x10]\n"
912 "ldr x24, [x20, #0x18]\n"
913 "ldr x23, [x20, #0x20]\n"
914 "ldr x21, [x20, #0x28]\n"
915 "cbnz x9, 89f\n"
916 "ldr w20, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
917 "add x27, x27, x20\n"
918 "add x26, x26, x20\n"
919 "add x25, x25, x20\n"
920 "add x24, x24, x20\n"
921 "add x23, x23, x20\n"
922 "add x21, x21, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000923 "b 89f\n"
924 "88:" // Height 6: setup direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +0000925 "mov x27, %x[input_ptr]\n"
926 "add x26, x27, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000927 "add x25, x26, %x[input_offset]\n"
928 "add x24, x25, %x[input_offset]\n"
929 "add x23, x24, %x[input_offset]\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000930 "add x21, x23, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000931 "89:" // Height 6: input setup done
Michael Tyler7d9a6262023-02-01 16:37:07 +0000932 "cmp x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000933 "blt 93f\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000934 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000935 "blt 92f\n"
936 "90:" // Height 6: Multiply loop: Main loop head
Michael Tyler7d9a6262023-02-01 16:37:07 +0000937 "ldr q31, [x27, #0x0]\n"
938 "ldr q28, [x26, #0x0]\n"
939 "cmp x10, #0x7e\n"
940 "add x27, x27, #0x10\n"
941 "ldr q25, [x25, #0x0]\n"
942 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000943 "add x26, x26, #0x10\n"
944 "add x25, x25, #0x10\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000945 "ldr q19, [x23, #0x0]\n"
946 "ldr q16, [x21, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000947 "add x24, x24, #0x10\n"
948 "add x23, x23, #0x10\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000949 "add x21, x21, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000950 "blt 91f\n"
951 "sadalp v0.4s, v1.8h\n"
952 "movi v1.8h, #0x0\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000953 "mov x10, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000954 "sadalp v29.4s, v30.8h\n"
955 "movi v30.8h, #0x0\n"
956 "sadalp v26.4s, v27.8h\n"
957 "movi v27.8h, #0x0\n"
958 "sadalp v23.4s, v24.8h\n"
959 "movi v24.8h, #0x0\n"
960 "sadalp v20.4s, v21.8h\n"
961 "movi v21.8h, #0x0\n"
962 "sadalp v17.4s, v18.8h\n"
963 "movi v18.8h, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000964 "91:" // Height 6: Multiply loop: unique 6: no collapse
Michael Tyler7d9a6262023-02-01 16:37:07 +0000965 "sub x28, x28, #0x10\n"
966 "cmp x28, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000967 "sadalp v1.8h, v31.16b\n"
968 "sadalp v30.8h, v28.16b\n"
969 "sadalp v27.8h, v25.16b\n"
970 "sadalp v24.8h, v22.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000971 "add x10, x10, #0x1\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000972 "sadalp v21.8h, v19.16b\n"
973 "sadalp v18.8h, v16.16b\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000974 "bge 90b\n"
975 "92:" // Height 6: Multiply loop: Single iteration only
Michael Tyler7d9a6262023-02-01 16:37:07 +0000976 "ldr q31, [x27, #0x0]\n"
977 "ldr q28, [x26, #0x0]\n"
978 "sub x28, x28, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000979 "sadalp v1.8h, v31.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000980 "ldr q25, [x25, #0x0]\n"
981 "ldr q22, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000982 "sadalp v30.8h, v28.16b\n"
983 "sadalp v27.8h, v25.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000984 "ldr q19, [x23, #0x0]\n"
985 "ldr q16, [x21, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000986 "sadalp v24.8h, v22.16b\n"
987 "sadalp v21.8h, v19.16b\n"
988 "sadalp v18.8h, v16.16b\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000989 "add x27, x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000990 "add x26, x26, #0x10\n"
991 "add x25, x25, #0x10\n"
992 "add x24, x24, #0x10\n"
993 "add x23, x23, #0x10\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +0000994 "add x21, x21, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000995 "93:" // Height 6: Multiply loop: Main loop skip
Michael Tyler7d9a6262023-02-01 16:37:07 +0000996 "cbz x28, 102f\n"
997 "tbz x28, #3, 97f\n"
998 "ldr d31, [x27], #0x8\n"
999 "ldr d28, [x26], #0x8\n"
1000 "ldr d25, [x25], #0x8\n"
1001 "ldr d22, [x24], #0x8\n"
1002 "ldr d19, [x23], #0x8\n"
1003 "ldr d16, [x21], #0x8\n"
1004 "tbz x28, #2, 95f\n"
1005 "ld1 { v31.s }[2], [x27], #0x4\n"
1006 "ld1 { v28.s }[2], [x26], #0x4\n"
1007 "ld1 { v25.s }[2], [x25], #0x4\n"
1008 "ld1 { v22.s }[2], [x24], #0x4\n"
1009 "ld1 { v19.s }[2], [x23], #0x4\n"
1010 "ld1 { v16.s }[2], [x21], #0x4\n"
1011 "tbz x28, #1, 94f\n"
1012 "ld1 { v31.h }[6], [x27], #0x2\n"
1013 "ld1 { v28.h }[6], [x26], #0x2\n"
1014 "ld1 { v25.h }[6], [x25], #0x2\n"
1015 "ld1 { v22.h }[6], [x24], #0x2\n"
1016 "ld1 { v19.h }[6], [x23], #0x2\n"
1017 "ld1 { v16.h }[6], [x21], #0x2\n"
1018 "tbz x28, #0, 101f\n"
1019 "ld1 { v31.b }[14], [x27]\n"
1020 "ld1 { v28.b }[14], [x26]\n"
1021 "ld1 { v25.b }[14], [x25]\n"
1022 "ld1 { v22.b }[14], [x24]\n"
1023 "ld1 { v19.b }[14], [x23]\n"
1024 "ld1 { v16.b }[14], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001025 "b 101f\n"
1026 "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
Michael Tyler7d9a6262023-02-01 16:37:07 +00001027 "tbz x28, #0, 101f\n"
1028 "ld1 { v31.b }[12], [x27]\n"
1029 "ld1 { v28.b }[12], [x26]\n"
1030 "ld1 { v25.b }[12], [x25]\n"
1031 "ld1 { v22.b }[12], [x24]\n"
1032 "ld1 { v19.b }[12], [x23]\n"
1033 "ld1 { v16.b }[12], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001034 "b 101f\n"
1035 "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
Michael Tyler7d9a6262023-02-01 16:37:07 +00001036 "tbz x28, #1, 96f\n"
1037 "ld1 { v31.h }[4], [x27], #0x2\n"
1038 "ld1 { v28.h }[4], [x26], #0x2\n"
1039 "ld1 { v25.h }[4], [x25], #0x2\n"
1040 "ld1 { v22.h }[4], [x24], #0x2\n"
1041 "ld1 { v19.h }[4], [x23], #0x2\n"
1042 "ld1 { v16.h }[4], [x21], #0x2\n"
1043 "tbz x28, #0, 101f\n"
1044 "ld1 { v31.b }[10], [x27]\n"
1045 "ld1 { v28.b }[10], [x26]\n"
1046 "ld1 { v25.b }[10], [x25]\n"
1047 "ld1 { v22.b }[10], [x24]\n"
1048 "ld1 { v19.b }[10], [x23]\n"
1049 "ld1 { v16.b }[10], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001050 "b 101f\n"
1051 "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
Michael Tyler7d9a6262023-02-01 16:37:07 +00001052 "tbz x28, #0, 101f\n"
1053 "ld1 { v31.b }[8], [x27]\n"
1054 "ld1 { v28.b }[8], [x26]\n"
1055 "ld1 { v25.b }[8], [x25]\n"
1056 "ld1 { v22.b }[8], [x24]\n"
1057 "ld1 { v19.b }[8], [x23]\n"
1058 "ld1 { v16.b }[8], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001059 "b 101f\n"
1060 "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
Michael Tyler7d9a6262023-02-01 16:37:07 +00001061 "tbz x28, #2, 99f\n"
1062 "ldr s31, [x27], #0x4\n"
1063 "ldr s28, [x26], #0x4\n"
1064 "ldr s25, [x25], #0x4\n"
1065 "ldr s22, [x24], #0x4\n"
1066 "ldr s19, [x23], #0x4\n"
1067 "ldr s16, [x21], #0x4\n"
1068 "tbz x28, #1, 98f\n"
1069 "ld1 { v31.h }[2], [x27], #0x2\n"
1070 "ld1 { v28.h }[2], [x26], #0x2\n"
1071 "ld1 { v25.h }[2], [x25], #0x2\n"
1072 "ld1 { v22.h }[2], [x24], #0x2\n"
1073 "ld1 { v19.h }[2], [x23], #0x2\n"
1074 "ld1 { v16.h }[2], [x21], #0x2\n"
1075 "tbz x28, #0, 101f\n"
1076 "ld1 { v31.b }[6], [x27]\n"
1077 "ld1 { v28.b }[6], [x26]\n"
1078 "ld1 { v25.b }[6], [x25]\n"
1079 "ld1 { v22.b }[6], [x24]\n"
1080 "ld1 { v19.b }[6], [x23]\n"
1081 "ld1 { v16.b }[6], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001082 "b 101f\n"
1083 "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
Michael Tyler7d9a6262023-02-01 16:37:07 +00001084 "tbz x28, #0, 101f\n"
1085 "ld1 { v31.b }[4], [x27]\n"
1086 "ld1 { v28.b }[4], [x26]\n"
1087 "ld1 { v25.b }[4], [x25]\n"
1088 "ld1 { v22.b }[4], [x24]\n"
1089 "ld1 { v19.b }[4], [x23]\n"
1090 "ld1 { v16.b }[4], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001091 "b 101f\n"
1092 "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
Michael Tyler7d9a6262023-02-01 16:37:07 +00001093 "tbz x28, #1, 100f\n"
1094 "ldr h31, [x27], #0x2\n"
1095 "ldr h28, [x26], #0x2\n"
1096 "ldr h25, [x25], #0x2\n"
1097 "ldr h22, [x24], #0x2\n"
1098 "ldr h19, [x23], #0x2\n"
1099 "ldr h16, [x21], #0x2\n"
1100 "tbz x28, #0, 101f\n"
1101 "ld1 { v31.b }[2], [x27]\n"
1102 "ld1 { v28.b }[2], [x26]\n"
1103 "ld1 { v25.b }[2], [x25]\n"
1104 "ld1 { v22.b }[2], [x24]\n"
1105 "ld1 { v19.b }[2], [x23]\n"
1106 "ld1 { v16.b }[2], [x21]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001107 "b 101f\n"
1108 "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
Michael Tyler7d9a6262023-02-01 16:37:07 +00001109 "ldr b31, [x27, #0x0]\n"
1110 "ldr b28, [x26, #0x0]\n"
1111 "ldr b25, [x25, #0x0]\n"
1112 "ldr b22, [x24, #0x0]\n"
1113 "ldr b19, [x23, #0x0]\n"
1114 "ldr b16, [x21, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001115 "101:" // Height 6: Multiply loop: Ragged operand read: Done
1116 "sadalp v1.8h, v31.16b\n"
1117 "sadalp v30.8h, v28.16b\n"
1118 "sadalp v27.8h, v25.16b\n"
1119 "sadalp v24.8h, v22.16b\n"
1120 "sadalp v21.8h, v19.16b\n"
1121 "sadalp v18.8h, v16.16b\n"
1122 "102:" // Height 6: Multiply loop: No odd multiplies
Michael Tyler7d9a6262023-02-01 16:37:07 +00001123 "add x9, x9, #0x1\n"
1124 "cmp x9, x22\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001125 "bne 87b\n"
1126 "sadalp v0.4s, v1.8h\n"
1127 "sadalp v29.4s, v30.8h\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +00001128 "subs %x[M], %x[M], #0x6\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001129 "sadalp v26.4s, v27.8h\n"
1130 "sadalp v23.4s, v24.8h\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001131 "sadalp v20.4s, v21.8h\n"
1132 "sadalp v17.4s, v18.8h\n"
1133 "addp v0.4s, v0.4s, v29.4s\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +00001134 "addp v29.4s, v26.4s, v23.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001135 "addp v20.4s, v20.4s, v17.4s\n"
Michael Tyler7d9a6262023-02-01 16:37:07 +00001136 "addp v0.4s, v0.4s, v29.4s\n"
1137 "addp v20.4s, v20.4s, v20.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001138 "mul v0.4s, v0.4s, v2.4s\n"
1139 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001140 "mul v20.4s, v20.4s, v2.4s\n"
1141 "str d20, [%x[out_ptr]], #0x8\n"
1142 "beq 104f\n"
1143 "tbz %x[flags], #3, 103f\n"
1144 "add %x[input_offset], %x[input_offset], #0x6\n"
1145 "b 1b\n"
1146 "103:" // Update direct input
Michael Tyler7d9a6262023-02-01 16:37:07 +00001147 "mov x20, #0x6\n"
1148 "madd %x[input_ptr], x20, %x[input_offset], %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001149 "b 1b\n"
1150 "104:" // Exit
1151
Michael Tyler7d9a6262023-02-01 16:37:07 +00001152 : [M] "+&r" (M), [input_offset] "+&r" (input_offset), [input_ptr] "+&r" (input_ptr), [out_ptr] "+&r" (out_ptr)
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001153 : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
Michael Tyler7d9a6262023-02-01 16:37:07 +00001154 : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001155 );
1156}
1157
1158} // namespace arm_gemm
1159
1160#endif // __aarch64__