blob: f5709d92acfa4fa22ff2ad2a95f633d2eb34180d [file] [log] [blame]
/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24
25#ifdef __aarch64__
26
27#include "arm_gemm.hpp"
28#include "quantized.hpp"
29#include "utils.hpp"
30
31#include <cassert>
32
33namespace arm_gemm {
34
35template<>
36void row_sums_indirect(
37 unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
38 size_t M, int32_t *out_ptr, const Requantize32 *qp
39)
40{
41 struct KernelArgs {
42 unsigned int num_strings;
43 const unsigned int *string_lengths;
44 unsigned int input_initial_col;
45 } ka;
46
47 unsigned long flags=0;
48 void *input_ptr;
49 size_t input_offset;
50
51 if (A_arg.is_indirect) {
52 input_ptr=(void *)(A_arg.indirect.ptr);
53 input_offset=A_arg.indirect.start_row;
54 ka.input_initial_col=A_arg.indirect.start_col;
55 flags |= 0x8;
56 } else {
57 assert(num_strings==1);
58 input_ptr=(void *)(A_arg.direct.base);
59 input_offset=A_arg.direct.stride;
60 }
61
62 ka.num_strings = num_strings;
63 ka.string_lengths = string_lengths;
64
65 __asm__ __volatile__(
Michael Tylerbe13cea2023-01-17 11:04:14 +000066 "add x19, %x[qp], %[b_offset]\n"
67 "ld1r { v2.4s }, [x19]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000068 "neg v2.4s, v2.4s\n"
69 "1:" // Row loop
70 "cmp %x[M], #0x6\n"
71 "bge 86f\n"
72 "cmp %x[M], #0x4\n"
73 "bgt 69f\n"
74 "beq 52f\n"
75 "cmp %x[M], #0x2\n"
76 "bgt 35f\n"
77 "beq 18f\n"
78 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000079 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000080 "movi v0.4s, #0x0\n"
81 "mov x9, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000082 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000083 "2:" // Height 1: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +000084 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
85 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000086 "tbz %x[flags], #3, 3f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000087 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
88 "add x19, x19, %x[input_offset], LSL #3\n"
89 "ldr x26, [x19, #0x0]\n"
90 "cbnz x28, 4f\n"
91 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
92 "add x26, x26, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000093 "b 4f\n"
94 "3:" // Height 1: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +000095 "mov x26, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000096 "4:" // Height 1: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +000097 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000098 "blt 8f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +000099 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000100 "blt 7f\n"
101 "5:" // Height 1: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000102 "ldr q31, [x26, #0x0]\n"
103 "cmp x9, #0x7e\n"
104 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000105 "blt 6f\n"
106 "uadalp v0.4s, v1.8h\n"
107 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000108 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000109 "6:" // Height 1: Multiply loop: unique 1: no collapse
110 "uadalp v1.8h, v31.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000111 "add x9, x9, #0x1\n"
112 "sub x27, x27, #0x10\n"
113 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000114 "bge 5b\n"
115 "7:" // Height 1: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000116 "sub x27, x27, #0x10\n"
117 "ldr q31, [x26, #0x0]\n"
118 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000119 "uadalp v1.8h, v31.16b\n"
120 "8:" // Height 1: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000121 "cbz x27, 17f\n"
122 "tbz x27, #3, 12f\n"
123 "ldr d31, [x26], #0x8\n"
124 "tbz x27, #2, 10f\n"
125 "ld1 { v31.s }[2], [x26], #0x4\n"
126 "tbz x27, #1, 9f\n"
127 "ld1 { v31.h }[6], [x26], #0x2\n"
128 "tbz x27, #0, 16f\n"
129 "ld1 { v31.b }[14], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000130 "b 16f\n"
131 "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +0000132 "tbz x27, #0, 16f\n"
133 "ld1 { v31.b }[12], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000134 "b 16f\n"
135 "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000136 "tbz x27, #1, 11f\n"
137 "ld1 { v31.h }[4], [x26], #0x2\n"
138 "tbz x27, #0, 16f\n"
139 "ld1 { v31.b }[10], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000140 "b 16f\n"
141 "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000142 "tbz x27, #0, 16f\n"
143 "ld1 { v31.b }[8], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000144 "b 16f\n"
145 "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000146 "tbz x27, #2, 14f\n"
147 "ldr s31, [x26], #0x4\n"
148 "tbz x27, #1, 13f\n"
149 "ld1 { v31.h }[2], [x26], #0x2\n"
150 "tbz x27, #0, 16f\n"
151 "ld1 { v31.b }[6], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000152 "b 16f\n"
153 "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +0000154 "tbz x27, #0, 16f\n"
155 "ld1 { v31.b }[4], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000156 "b 16f\n"
157 "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000158 "tbz x27, #1, 15f\n"
159 "ldr h31, [x26], #0x2\n"
160 "tbz x27, #0, 16f\n"
161 "ld1 { v31.b }[2], [x26]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000162 "b 16f\n"
163 "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000164 "ldr b31, [x26, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000165 "16:" // Height 1: Multiply loop: Ragged operand read: Done
166 "uadalp v1.8h, v31.16b\n"
167 "17:" // Height 1: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +0000168 "add x28, x28, #0x1\n"
169 "cmp x28, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000170 "bne 2b\n"
171 "uadalp v0.4s, v1.8h\n"
172 "addp v0.4s, v0.4s, v0.4s\n"
173 "addp v0.4s, v0.4s, v0.4s\n"
174 "mul v0.4s, v0.4s, v2.4s\n"
175 "str s0, [%x[out_ptr]], #0x4\n"
176 "b 104f\n"
177 "18:" // Height 2
178 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000179 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
180 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000181 "movi v0.4s, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000182 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000183 "movi v30.8h, #0x0\n"
184 "movi v29.4s, #0x0\n"
185 "19:" // Height 2: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000186 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
187 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000188 "tbz %x[flags], #3, 20f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000189 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
190 "add x19, x19, %x[input_offset], LSL #3\n"
191 "ldr x26, [x19, #0x0]\n"
192 "ldr x25, [x19, #0x8]\n"
193 "cbnz x28, 21f\n"
194 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
195 "add x26, x26, x19\n"
196 "add x25, x25, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000197 "b 21f\n"
198 "20:" // Height 2: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +0000199 "mov x26, %x[input_ptr]\n"
200 "add x25, x26, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000201 "21:" // Height 2: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +0000202 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000203 "blt 25f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000204 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000205 "blt 24f\n"
206 "22:" // Height 2: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000207 "ldr q31, [x26, #0x0]\n"
208 "ldr q28, [x25, #0x0]\n"
209 "cmp x9, #0x7e\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000210 "add x26, x26, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000211 "add x25, x25, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000212 "blt 23f\n"
213 "uadalp v0.4s, v1.8h\n"
214 "movi v1.8h, #0x0\n"
215 "uadalp v29.4s, v30.8h\n"
216 "movi v30.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000217 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000218 "23:" // Height 2: Multiply loop: unique 2: no collapse
219 "uadalp v1.8h, v31.16b\n"
220 "uadalp v30.8h, v28.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000221 "add x9, x9, #0x1\n"
222 "sub x27, x27, #0x10\n"
223 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000224 "bge 22b\n"
225 "24:" // Height 2: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000226 "sub x27, x27, #0x10\n"
227 "ldr q31, [x26, #0x0]\n"
228 "ldr q28, [x25, #0x0]\n"
229 "add x26, x26, #0x10\n"
230 "add x25, x25, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000231 "uadalp v1.8h, v31.16b\n"
232 "uadalp v30.8h, v28.16b\n"
233 "25:" // Height 2: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000234 "cbz x27, 34f\n"
235 "tbz x27, #3, 29f\n"
236 "ldr d31, [x26], #0x8\n"
237 "ldr d28, [x25], #0x8\n"
238 "tbz x27, #2, 27f\n"
239 "ld1 { v31.s }[2], [x26], #0x4\n"
240 "ld1 { v28.s }[2], [x25], #0x4\n"
241 "tbz x27, #1, 26f\n"
242 "ld1 { v31.h }[6], [x26], #0x2\n"
243 "ld1 { v28.h }[6], [x25], #0x2\n"
244 "tbz x27, #0, 33f\n"
245 "ld1 { v31.b }[14], [x26]\n"
246 "ld1 { v28.b }[14], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000247 "b 33f\n"
248 "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +0000249 "tbz x27, #0, 33f\n"
250 "ld1 { v31.b }[12], [x26]\n"
251 "ld1 { v28.b }[12], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000252 "b 33f\n"
253 "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000254 "tbz x27, #1, 28f\n"
255 "ld1 { v31.h }[4], [x26], #0x2\n"
256 "ld1 { v28.h }[4], [x25], #0x2\n"
257 "tbz x27, #0, 33f\n"
258 "ld1 { v31.b }[10], [x26]\n"
259 "ld1 { v28.b }[10], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000260 "b 33f\n"
261 "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000262 "tbz x27, #0, 33f\n"
263 "ld1 { v31.b }[8], [x26]\n"
264 "ld1 { v28.b }[8], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000265 "b 33f\n"
266 "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000267 "tbz x27, #2, 31f\n"
268 "ldr s31, [x26], #0x4\n"
269 "ldr s28, [x25], #0x4\n"
270 "tbz x27, #1, 30f\n"
271 "ld1 { v31.h }[2], [x26], #0x2\n"
272 "ld1 { v28.h }[2], [x25], #0x2\n"
273 "tbz x27, #0, 33f\n"
274 "ld1 { v31.b }[6], [x26]\n"
275 "ld1 { v28.b }[6], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000276 "b 33f\n"
277 "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +0000278 "tbz x27, #0, 33f\n"
279 "ld1 { v31.b }[4], [x26]\n"
280 "ld1 { v28.b }[4], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000281 "b 33f\n"
282 "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000283 "tbz x27, #1, 32f\n"
284 "ldr h31, [x26], #0x2\n"
285 "ldr h28, [x25], #0x2\n"
286 "tbz x27, #0, 33f\n"
287 "ld1 { v31.b }[2], [x26]\n"
288 "ld1 { v28.b }[2], [x25]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000289 "b 33f\n"
290 "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000291 "ldr b31, [x26, #0x0]\n"
292 "ldr b28, [x25, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000293 "33:" // Height 2: Multiply loop: Ragged operand read: Done
294 "uadalp v1.8h, v31.16b\n"
295 "uadalp v30.8h, v28.16b\n"
296 "34:" // Height 2: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +0000297 "add x28, x28, #0x1\n"
298 "cmp x28, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000299 "bne 19b\n"
300 "uadalp v0.4s, v1.8h\n"
301 "uadalp v29.4s, v30.8h\n"
302 "addp v0.4s, v0.4s, v29.4s\n"
303 "addp v0.4s, v0.4s, v0.4s\n"
304 "mul v0.4s, v0.4s, v2.4s\n"
305 "str d0, [%x[out_ptr]], #0x8\n"
306 "b 104f\n"
307 "35:" // Height 3
308 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000309 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
310 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000311 "movi v0.4s, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000312 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000313 "movi v30.8h, #0x0\n"
314 "movi v29.4s, #0x0\n"
315 "movi v27.8h, #0x0\n"
316 "movi v26.4s, #0x0\n"
317 "36:" // Height 3: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000318 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
319 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000320 "tbz %x[flags], #3, 37f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000321 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
322 "add x19, x19, %x[input_offset], LSL #3\n"
323 "ldr x26, [x19, #0x0]\n"
324 "ldr x25, [x19, #0x8]\n"
325 "ldr x24, [x19, #0x10]\n"
326 "cbnz x28, 38f\n"
327 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
328 "add x26, x26, x19\n"
329 "add x25, x25, x19\n"
330 "add x24, x24, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000331 "b 38f\n"
332 "37:" // Height 3: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +0000333 "mov x26, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000334 "add x25, x26, %x[input_offset]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000335 "add x24, x25, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000336 "38:" // Height 3: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +0000337 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000338 "blt 42f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000339 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000340 "blt 41f\n"
341 "39:" // Height 3: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000342 "ldr q31, [x26, #0x0]\n"
343 "ldr q28, [x25, #0x0]\n"
344 "ldr q25, [x24, #0x0]\n"
345 "cmp x9, #0x7e\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000346 "add x26, x26, #0x10\n"
347 "add x25, x25, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000348 "add x24, x24, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000349 "blt 40f\n"
350 "uadalp v0.4s, v1.8h\n"
351 "movi v1.8h, #0x0\n"
352 "uadalp v29.4s, v30.8h\n"
353 "movi v30.8h, #0x0\n"
354 "uadalp v26.4s, v27.8h\n"
355 "movi v27.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000356 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000357 "40:" // Height 3: Multiply loop: unique 3: no collapse
358 "uadalp v1.8h, v31.16b\n"
359 "uadalp v30.8h, v28.16b\n"
360 "uadalp v27.8h, v25.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000361 "add x9, x9, #0x1\n"
362 "sub x27, x27, #0x10\n"
363 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000364 "bge 39b\n"
365 "41:" // Height 3: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000366 "sub x27, x27, #0x10\n"
367 "ldr q31, [x26, #0x0]\n"
368 "ldr q28, [x25, #0x0]\n"
369 "ldr q25, [x24, #0x0]\n"
Michael Tylerba209752022-12-15 12:39:29 +0000370 "add x26, x26, #0x10\n"
371 "add x25, x25, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000372 "uadalp v1.8h, v31.16b\n"
373 "uadalp v30.8h, v28.16b\n"
374 "uadalp v27.8h, v25.16b\n"
375 "add x24, x24, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000376 "42:" // Height 3: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000377 "cbz x27, 51f\n"
378 "tbz x27, #3, 46f\n"
379 "ldr d31, [x26], #0x8\n"
380 "ldr d28, [x25], #0x8\n"
381 "ldr d25, [x24], #0x8\n"
382 "tbz x27, #2, 44f\n"
383 "ld1 { v31.s }[2], [x26], #0x4\n"
384 "ld1 { v28.s }[2], [x25], #0x4\n"
385 "ld1 { v25.s }[2], [x24], #0x4\n"
386 "tbz x27, #1, 43f\n"
387 "ld1 { v31.h }[6], [x26], #0x2\n"
388 "ld1 { v28.h }[6], [x25], #0x2\n"
389 "ld1 { v25.h }[6], [x24], #0x2\n"
390 "tbz x27, #0, 50f\n"
391 "ld1 { v31.b }[14], [x26]\n"
392 "ld1 { v28.b }[14], [x25]\n"
393 "ld1 { v25.b }[14], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000394 "b 50f\n"
395 "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +0000396 "tbz x27, #0, 50f\n"
397 "ld1 { v31.b }[12], [x26]\n"
398 "ld1 { v28.b }[12], [x25]\n"
399 "ld1 { v25.b }[12], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000400 "b 50f\n"
401 "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000402 "tbz x27, #1, 45f\n"
403 "ld1 { v31.h }[4], [x26], #0x2\n"
404 "ld1 { v28.h }[4], [x25], #0x2\n"
405 "ld1 { v25.h }[4], [x24], #0x2\n"
406 "tbz x27, #0, 50f\n"
407 "ld1 { v31.b }[10], [x26]\n"
408 "ld1 { v28.b }[10], [x25]\n"
409 "ld1 { v25.b }[10], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000410 "b 50f\n"
411 "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000412 "tbz x27, #0, 50f\n"
413 "ld1 { v31.b }[8], [x26]\n"
414 "ld1 { v28.b }[8], [x25]\n"
415 "ld1 { v25.b }[8], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000416 "b 50f\n"
417 "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000418 "tbz x27, #2, 48f\n"
419 "ldr s31, [x26], #0x4\n"
420 "ldr s28, [x25], #0x4\n"
421 "ldr s25, [x24], #0x4\n"
422 "tbz x27, #1, 47f\n"
423 "ld1 { v31.h }[2], [x26], #0x2\n"
424 "ld1 { v28.h }[2], [x25], #0x2\n"
425 "ld1 { v25.h }[2], [x24], #0x2\n"
426 "tbz x27, #0, 50f\n"
427 "ld1 { v31.b }[6], [x26]\n"
428 "ld1 { v28.b }[6], [x25]\n"
429 "ld1 { v25.b }[6], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000430 "b 50f\n"
431 "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +0000432 "tbz x27, #0, 50f\n"
433 "ld1 { v31.b }[4], [x26]\n"
434 "ld1 { v28.b }[4], [x25]\n"
435 "ld1 { v25.b }[4], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000436 "b 50f\n"
437 "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000438 "tbz x27, #1, 49f\n"
439 "ldr h31, [x26], #0x2\n"
440 "ldr h28, [x25], #0x2\n"
441 "ldr h25, [x24], #0x2\n"
442 "tbz x27, #0, 50f\n"
443 "ld1 { v31.b }[2], [x26]\n"
444 "ld1 { v28.b }[2], [x25]\n"
445 "ld1 { v25.b }[2], [x24]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000446 "b 50f\n"
447 "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000448 "ldr b31, [x26, #0x0]\n"
449 "ldr b28, [x25, #0x0]\n"
450 "ldr b25, [x24, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000451 "50:" // Height 3: Multiply loop: Ragged operand read: Done
452 "uadalp v1.8h, v31.16b\n"
453 "uadalp v30.8h, v28.16b\n"
454 "uadalp v27.8h, v25.16b\n"
455 "51:" // Height 3: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +0000456 "add x28, x28, #0x1\n"
457 "cmp x28, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000458 "bne 36b\n"
459 "uadalp v0.4s, v1.8h\n"
460 "uadalp v29.4s, v30.8h\n"
Michael Tylerba209752022-12-15 12:39:29 +0000461 "addp v0.4s, v0.4s, v29.4s\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000462 "uadalp v26.4s, v27.8h\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000463 "addp v0.4s, v0.4s, v0.4s\n"
464 "addp v26.4s, v26.4s, v26.4s\n"
465 "mul v0.4s, v0.4s, v2.4s\n"
466 "str d0, [%x[out_ptr]], #0x8\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000467 "addp v26.4s, v26.4s, v26.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000468 "mul v26.4s, v26.4s, v2.4s\n"
469 "str s26, [%x[out_ptr]], #0x4\n"
470 "b 104f\n"
471 "52:" // Height 4
472 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000473 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
474 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000475 "movi v0.4s, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000476 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000477 "movi v30.8h, #0x0\n"
478 "movi v29.4s, #0x0\n"
479 "movi v27.8h, #0x0\n"
480 "movi v26.4s, #0x0\n"
481 "movi v24.8h, #0x0\n"
482 "movi v23.4s, #0x0\n"
483 "53:" // Height 4: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000484 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
485 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000486 "tbz %x[flags], #3, 54f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000487 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
488 "add x19, x19, %x[input_offset], LSL #3\n"
489 "ldr x26, [x19, #0x0]\n"
490 "ldr x25, [x19, #0x8]\n"
491 "ldr x24, [x19, #0x10]\n"
492 "ldr x23, [x19, #0x18]\n"
493 "cbnz x28, 55f\n"
494 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
495 "add x26, x26, x19\n"
496 "add x25, x25, x19\n"
497 "add x24, x24, x19\n"
498 "add x23, x23, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000499 "b 55f\n"
500 "54:" // Height 4: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +0000501 "mov x26, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000502 "add x25, x26, %x[input_offset]\n"
503 "add x24, x25, %x[input_offset]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000504 "add x23, x24, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000505 "55:" // Height 4: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +0000506 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000507 "blt 59f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000508 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000509 "blt 58f\n"
510 "56:" // Height 4: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000511 "ldr q31, [x26, #0x0]\n"
512 "ldr q28, [x25, #0x0]\n"
513 "ldr q25, [x24, #0x0]\n"
514 "ldr q22, [x23, #0x0]\n"
515 "cmp x9, #0x7e\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000516 "add x26, x26, #0x10\n"
517 "add x25, x25, #0x10\n"
518 "add x24, x24, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000519 "add x23, x23, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000520 "blt 57f\n"
521 "uadalp v0.4s, v1.8h\n"
522 "movi v1.8h, #0x0\n"
523 "uadalp v29.4s, v30.8h\n"
524 "movi v30.8h, #0x0\n"
525 "uadalp v26.4s, v27.8h\n"
526 "movi v27.8h, #0x0\n"
527 "uadalp v23.4s, v24.8h\n"
528 "movi v24.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000529 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000530 "57:" // Height 4: Multiply loop: unique 4: no collapse
531 "uadalp v1.8h, v31.16b\n"
532 "uadalp v30.8h, v28.16b\n"
533 "uadalp v27.8h, v25.16b\n"
534 "uadalp v24.8h, v22.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000535 "add x9, x9, #0x1\n"
536 "sub x27, x27, #0x10\n"
537 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000538 "bge 56b\n"
539 "58:" // Height 4: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000540 "sub x27, x27, #0x10\n"
541 "ldr q31, [x26, #0x0]\n"
542 "ldr q28, [x25, #0x0]\n"
543 "ldr q25, [x24, #0x0]\n"
544 "ldr q22, [x23, #0x0]\n"
545 "add x26, x26, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000546 "uadalp v1.8h, v31.16b\n"
547 "uadalp v30.8h, v28.16b\n"
548 "uadalp v27.8h, v25.16b\n"
549 "uadalp v24.8h, v22.16b\n"
550 "add x25, x25, #0x10\n"
551 "add x24, x24, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000552 "add x23, x23, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000553 "59:" // Height 4: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000554 "cbz x27, 68f\n"
555 "tbz x27, #3, 63f\n"
556 "ldr d31, [x26], #0x8\n"
557 "ldr d28, [x25], #0x8\n"
558 "ldr d25, [x24], #0x8\n"
559 "ldr d22, [x23], #0x8\n"
560 "tbz x27, #2, 61f\n"
561 "ld1 { v31.s }[2], [x26], #0x4\n"
562 "ld1 { v28.s }[2], [x25], #0x4\n"
563 "ld1 { v25.s }[2], [x24], #0x4\n"
564 "ld1 { v22.s }[2], [x23], #0x4\n"
565 "tbz x27, #1, 60f\n"
566 "ld1 { v31.h }[6], [x26], #0x2\n"
567 "ld1 { v28.h }[6], [x25], #0x2\n"
568 "ld1 { v25.h }[6], [x24], #0x2\n"
569 "ld1 { v22.h }[6], [x23], #0x2\n"
570 "tbz x27, #0, 67f\n"
571 "ld1 { v31.b }[14], [x26]\n"
572 "ld1 { v28.b }[14], [x25]\n"
573 "ld1 { v25.b }[14], [x24]\n"
574 "ld1 { v22.b }[14], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000575 "b 67f\n"
576 "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +0000577 "tbz x27, #0, 67f\n"
578 "ld1 { v31.b }[12], [x26]\n"
579 "ld1 { v28.b }[12], [x25]\n"
580 "ld1 { v25.b }[12], [x24]\n"
581 "ld1 { v22.b }[12], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000582 "b 67f\n"
583 "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000584 "tbz x27, #1, 62f\n"
585 "ld1 { v31.h }[4], [x26], #0x2\n"
586 "ld1 { v28.h }[4], [x25], #0x2\n"
587 "ld1 { v25.h }[4], [x24], #0x2\n"
588 "ld1 { v22.h }[4], [x23], #0x2\n"
589 "tbz x27, #0, 67f\n"
590 "ld1 { v31.b }[10], [x26]\n"
591 "ld1 { v28.b }[10], [x25]\n"
592 "ld1 { v25.b }[10], [x24]\n"
593 "ld1 { v22.b }[10], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000594 "b 67f\n"
595 "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000596 "tbz x27, #0, 67f\n"
597 "ld1 { v31.b }[8], [x26]\n"
598 "ld1 { v28.b }[8], [x25]\n"
599 "ld1 { v25.b }[8], [x24]\n"
600 "ld1 { v22.b }[8], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000601 "b 67f\n"
602 "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000603 "tbz x27, #2, 65f\n"
604 "ldr s31, [x26], #0x4\n"
605 "ldr s28, [x25], #0x4\n"
606 "ldr s25, [x24], #0x4\n"
607 "ldr s22, [x23], #0x4\n"
608 "tbz x27, #1, 64f\n"
609 "ld1 { v31.h }[2], [x26], #0x2\n"
610 "ld1 { v28.h }[2], [x25], #0x2\n"
611 "ld1 { v25.h }[2], [x24], #0x2\n"
612 "ld1 { v22.h }[2], [x23], #0x2\n"
613 "tbz x27, #0, 67f\n"
614 "ld1 { v31.b }[6], [x26]\n"
615 "ld1 { v28.b }[6], [x25]\n"
616 "ld1 { v25.b }[6], [x24]\n"
617 "ld1 { v22.b }[6], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000618 "b 67f\n"
619 "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +0000620 "tbz x27, #0, 67f\n"
621 "ld1 { v31.b }[4], [x26]\n"
622 "ld1 { v28.b }[4], [x25]\n"
623 "ld1 { v25.b }[4], [x24]\n"
624 "ld1 { v22.b }[4], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000625 "b 67f\n"
626 "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000627 "tbz x27, #1, 66f\n"
628 "ldr h31, [x26], #0x2\n"
629 "ldr h28, [x25], #0x2\n"
630 "ldr h25, [x24], #0x2\n"
631 "ldr h22, [x23], #0x2\n"
632 "tbz x27, #0, 67f\n"
633 "ld1 { v31.b }[2], [x26]\n"
634 "ld1 { v28.b }[2], [x25]\n"
635 "ld1 { v25.b }[2], [x24]\n"
636 "ld1 { v22.b }[2], [x23]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000637 "b 67f\n"
638 "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000639 "ldr b31, [x26, #0x0]\n"
640 "ldr b28, [x25, #0x0]\n"
641 "ldr b25, [x24, #0x0]\n"
642 "ldr b22, [x23, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000643 "67:" // Height 4: Multiply loop: Ragged operand read: Done
644 "uadalp v1.8h, v31.16b\n"
645 "uadalp v30.8h, v28.16b\n"
646 "uadalp v27.8h, v25.16b\n"
647 "uadalp v24.8h, v22.16b\n"
648 "68:" // Height 4: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +0000649 "add x28, x28, #0x1\n"
650 "cmp x28, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000651 "bne 53b\n"
652 "uadalp v0.4s, v1.8h\n"
653 "uadalp v29.4s, v30.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000654 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000655 "uadalp v26.4s, v27.8h\n"
656 "uadalp v23.4s, v24.8h\n"
657 "addp v29.4s, v26.4s, v23.4s\n"
658 "addp v0.4s, v0.4s, v29.4s\n"
659 "mul v0.4s, v0.4s, v2.4s\n"
660 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
661 "b 104f\n"
662 "69:" // Height 5
663 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000664 "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
665 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000666 "movi v0.4s, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000667 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000668 "movi v30.8h, #0x0\n"
669 "movi v29.4s, #0x0\n"
670 "movi v27.8h, #0x0\n"
671 "movi v26.4s, #0x0\n"
672 "movi v24.8h, #0x0\n"
673 "movi v23.4s, #0x0\n"
674 "movi v21.8h, #0x0\n"
675 "movi v20.4s, #0x0\n"
676 "70:" // Height 5: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000677 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
678 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000679 "tbz %x[flags], #3, 71f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000680 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
681 "add x19, x19, %x[input_offset], LSL #3\n"
682 "ldr x26, [x19, #0x0]\n"
683 "ldr x25, [x19, #0x8]\n"
684 "ldr x24, [x19, #0x10]\n"
685 "ldr x23, [x19, #0x18]\n"
686 "ldr x22, [x19, #0x20]\n"
687 "cbnz x28, 72f\n"
688 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
689 "add x26, x26, x19\n"
690 "add x25, x25, x19\n"
691 "add x24, x24, x19\n"
692 "add x23, x23, x19\n"
693 "add x22, x22, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000694 "b 72f\n"
695 "71:" // Height 5: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +0000696 "mov x26, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000697 "add x25, x26, %x[input_offset]\n"
698 "add x24, x25, %x[input_offset]\n"
699 "add x23, x24, %x[input_offset]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000700 "add x22, x23, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000701 "72:" // Height 5: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +0000702 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000703 "blt 76f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000704 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000705 "blt 75f\n"
706 "73:" // Height 5: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000707 "ldr q31, [x26, #0x0]\n"
708 "ldr q28, [x25, #0x0]\n"
709 "ldr q25, [x24, #0x0]\n"
710 "ldr q22, [x23, #0x0]\n"
711 "ldr q19, [x22, #0x0]\n"
712 "cmp x9, #0x7e\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000713 "add x26, x26, #0x10\n"
714 "add x25, x25, #0x10\n"
715 "add x24, x24, #0x10\n"
716 "add x23, x23, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000717 "add x22, x22, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000718 "blt 74f\n"
719 "uadalp v0.4s, v1.8h\n"
720 "movi v1.8h, #0x0\n"
721 "uadalp v29.4s, v30.8h\n"
722 "movi v30.8h, #0x0\n"
723 "uadalp v26.4s, v27.8h\n"
724 "movi v27.8h, #0x0\n"
725 "uadalp v23.4s, v24.8h\n"
726 "movi v24.8h, #0x0\n"
727 "uadalp v20.4s, v21.8h\n"
728 "movi v21.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000729 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000730 "74:" // Height 5: Multiply loop: unique 5: no collapse
731 "uadalp v1.8h, v31.16b\n"
732 "uadalp v30.8h, v28.16b\n"
733 "uadalp v27.8h, v25.16b\n"
734 "uadalp v24.8h, v22.16b\n"
735 "uadalp v21.8h, v19.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000736 "add x9, x9, #0x1\n"
737 "sub x27, x27, #0x10\n"
738 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000739 "bge 73b\n"
740 "75:" // Height 5: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000741 "sub x27, x27, #0x10\n"
742 "ldr q31, [x26, #0x0]\n"
743 "ldr q28, [x25, #0x0]\n"
744 "ldr q25, [x24, #0x0]\n"
745 "ldr q22, [x23, #0x0]\n"
746 "ldr q19, [x22, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000747 "uadalp v1.8h, v31.16b\n"
748 "uadalp v30.8h, v28.16b\n"
749 "uadalp v27.8h, v25.16b\n"
750 "uadalp v24.8h, v22.16b\n"
751 "uadalp v21.8h, v19.16b\n"
752 "add x26, x26, #0x10\n"
753 "add x25, x25, #0x10\n"
754 "add x24, x24, #0x10\n"
755 "add x23, x23, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000756 "add x22, x22, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000757 "76:" // Height 5: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000758 "cbz x27, 85f\n"
759 "tbz x27, #3, 80f\n"
760 "ldr d31, [x26], #0x8\n"
761 "ldr d28, [x25], #0x8\n"
762 "ldr d25, [x24], #0x8\n"
763 "ldr d22, [x23], #0x8\n"
764 "ldr d19, [x22], #0x8\n"
765 "tbz x27, #2, 78f\n"
766 "ld1 { v31.s }[2], [x26], #0x4\n"
767 "ld1 { v28.s }[2], [x25], #0x4\n"
768 "ld1 { v25.s }[2], [x24], #0x4\n"
769 "ld1 { v22.s }[2], [x23], #0x4\n"
770 "ld1 { v19.s }[2], [x22], #0x4\n"
771 "tbz x27, #1, 77f\n"
772 "ld1 { v31.h }[6], [x26], #0x2\n"
773 "ld1 { v28.h }[6], [x25], #0x2\n"
774 "ld1 { v25.h }[6], [x24], #0x2\n"
775 "ld1 { v22.h }[6], [x23], #0x2\n"
776 "ld1 { v19.h }[6], [x22], #0x2\n"
777 "tbz x27, #0, 84f\n"
778 "ld1 { v31.b }[14], [x26]\n"
779 "ld1 { v28.b }[14], [x25]\n"
780 "ld1 { v25.b }[14], [x24]\n"
781 "ld1 { v22.b }[14], [x23]\n"
782 "ld1 { v19.b }[14], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000783 "b 84f\n"
784 "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +0000785 "tbz x27, #0, 84f\n"
786 "ld1 { v31.b }[12], [x26]\n"
787 "ld1 { v28.b }[12], [x25]\n"
788 "ld1 { v25.b }[12], [x24]\n"
789 "ld1 { v22.b }[12], [x23]\n"
790 "ld1 { v19.b }[12], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000791 "b 84f\n"
792 "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000793 "tbz x27, #1, 79f\n"
794 "ld1 { v31.h }[4], [x26], #0x2\n"
795 "ld1 { v28.h }[4], [x25], #0x2\n"
796 "ld1 { v25.h }[4], [x24], #0x2\n"
797 "ld1 { v22.h }[4], [x23], #0x2\n"
798 "ld1 { v19.h }[4], [x22], #0x2\n"
799 "tbz x27, #0, 84f\n"
800 "ld1 { v31.b }[10], [x26]\n"
801 "ld1 { v28.b }[10], [x25]\n"
802 "ld1 { v25.b }[10], [x24]\n"
803 "ld1 { v22.b }[10], [x23]\n"
804 "ld1 { v19.b }[10], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000805 "b 84f\n"
806 "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +0000807 "tbz x27, #0, 84f\n"
808 "ld1 { v31.b }[8], [x26]\n"
809 "ld1 { v28.b }[8], [x25]\n"
810 "ld1 { v25.b }[8], [x24]\n"
811 "ld1 { v22.b }[8], [x23]\n"
812 "ld1 { v19.b }[8], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000813 "b 84f\n"
814 "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000815 "tbz x27, #2, 82f\n"
816 "ldr s31, [x26], #0x4\n"
817 "ldr s28, [x25], #0x4\n"
818 "ldr s25, [x24], #0x4\n"
819 "ldr s22, [x23], #0x4\n"
820 "ldr s19, [x22], #0x4\n"
821 "tbz x27, #1, 81f\n"
822 "ld1 { v31.h }[2], [x26], #0x2\n"
823 "ld1 { v28.h }[2], [x25], #0x2\n"
824 "ld1 { v25.h }[2], [x24], #0x2\n"
825 "ld1 { v22.h }[2], [x23], #0x2\n"
826 "ld1 { v19.h }[2], [x22], #0x2\n"
827 "tbz x27, #0, 84f\n"
828 "ld1 { v31.b }[6], [x26]\n"
829 "ld1 { v28.b }[6], [x25]\n"
830 "ld1 { v25.b }[6], [x24]\n"
831 "ld1 { v22.b }[6], [x23]\n"
832 "ld1 { v19.b }[6], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000833 "b 84f\n"
834 "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +0000835 "tbz x27, #0, 84f\n"
836 "ld1 { v31.b }[4], [x26]\n"
837 "ld1 { v28.b }[4], [x25]\n"
838 "ld1 { v25.b }[4], [x24]\n"
839 "ld1 { v22.b }[4], [x23]\n"
840 "ld1 { v19.b }[4], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000841 "b 84f\n"
842 "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000843 "tbz x27, #1, 83f\n"
844 "ldr h31, [x26], #0x2\n"
845 "ldr h28, [x25], #0x2\n"
846 "ldr h25, [x24], #0x2\n"
847 "ldr h22, [x23], #0x2\n"
848 "ldr h19, [x22], #0x2\n"
849 "tbz x27, #0, 84f\n"
850 "ld1 { v31.b }[2], [x26]\n"
851 "ld1 { v28.b }[2], [x25]\n"
852 "ld1 { v25.b }[2], [x24]\n"
853 "ld1 { v22.b }[2], [x23]\n"
854 "ld1 { v19.b }[2], [x22]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000855 "b 84f\n"
856 "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +0000857 "ldr b31, [x26, #0x0]\n"
858 "ldr b28, [x25, #0x0]\n"
859 "ldr b25, [x24, #0x0]\n"
860 "ldr b22, [x23, #0x0]\n"
861 "ldr b19, [x22, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000862 "84:" // Height 5: Multiply loop: Ragged operand read: Done
863 "uadalp v1.8h, v31.16b\n"
864 "uadalp v30.8h, v28.16b\n"
865 "uadalp v27.8h, v25.16b\n"
866 "uadalp v24.8h, v22.16b\n"
867 "uadalp v21.8h, v19.16b\n"
868 "85:" // Height 5: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +0000869 "add x28, x28, #0x1\n"
870 "cmp x28, x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000871 "bne 70b\n"
872 "uadalp v0.4s, v1.8h\n"
873 "uadalp v29.4s, v30.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000874 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000875 "uadalp v26.4s, v27.8h\n"
876 "uadalp v23.4s, v24.8h\n"
Michael Tylerba209752022-12-15 12:39:29 +0000877 "addp v29.4s, v26.4s, v23.4s\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000878 "uadalp v20.4s, v21.8h\n"
Michael Tylerba209752022-12-15 12:39:29 +0000879 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000880 "addp v20.4s, v20.4s, v20.4s\n"
881 "mul v0.4s, v0.4s, v2.4s\n"
882 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000883 "addp v20.4s, v20.4s, v20.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000884 "mul v20.4s, v20.4s, v2.4s\n"
885 "str s20, [%x[out_ptr]], #0x4\n"
886 "b 104f\n"
887 "86:" // Height 6
888 "movi v1.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000889 "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
890 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000891 "movi v0.4s, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000892 "mov x28, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000893 "movi v30.8h, #0x0\n"
894 "movi v29.4s, #0x0\n"
895 "movi v27.8h, #0x0\n"
896 "movi v26.4s, #0x0\n"
897 "movi v24.8h, #0x0\n"
898 "movi v23.4s, #0x0\n"
899 "movi v21.8h, #0x0\n"
900 "movi v20.4s, #0x0\n"
901 "movi v18.8h, #0x0\n"
902 "movi v17.4s, #0x0\n"
903 "87:" // Height 6: String loop
Michael Tylerbe13cea2023-01-17 11:04:14 +0000904 "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
905 "ldr w27, [x19, x28, LSL #0x2]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000906 "tbz %x[flags], #3, 88f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000907 "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
908 "add x19, x19, %x[input_offset], LSL #3\n"
909 "ldr x26, [x19, #0x0]\n"
910 "ldr x25, [x19, #0x8]\n"
911 "ldr x24, [x19, #0x10]\n"
912 "ldr x23, [x19, #0x18]\n"
913 "ldr x22, [x19, #0x20]\n"
914 "ldr x20, [x19, #0x28]\n"
915 "cbnz x28, 89f\n"
916 "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
917 "add x26, x26, x19\n"
918 "add x25, x25, x19\n"
919 "add x24, x24, x19\n"
920 "add x23, x23, x19\n"
921 "add x22, x22, x19\n"
922 "add x20, x20, x19\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000923 "b 89f\n"
924 "88:" // Height 6: setup direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +0000925 "mov x26, %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000926 "add x25, x26, %x[input_offset]\n"
927 "add x24, x25, %x[input_offset]\n"
928 "add x23, x24, %x[input_offset]\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000929 "add x22, x23, %x[input_offset]\n"
930 "add x20, x22, %x[input_offset]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000931 "89:" // Height 6: input setup done
Michael Tylerbe13cea2023-01-17 11:04:14 +0000932 "cmp x27, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000933 "blt 93f\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000934 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000935 "blt 92f\n"
936 "90:" // Height 6: Multiply loop: Main loop head
Michael Tylerbe13cea2023-01-17 11:04:14 +0000937 "ldr q31, [x26, #0x0]\n"
938 "ldr q28, [x25, #0x0]\n"
939 "ldr q25, [x24, #0x0]\n"
940 "ldr q22, [x23, #0x0]\n"
941 "ldr q19, [x22, #0x0]\n"
942 "ldr q16, [x20, #0x0]\n"
943 "cmp x9, #0x7e\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000944 "add x26, x26, #0x10\n"
945 "add x25, x25, #0x10\n"
946 "add x24, x24, #0x10\n"
947 "add x23, x23, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000948 "add x22, x22, #0x10\n"
949 "add x20, x20, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000950 "blt 91f\n"
951 "uadalp v0.4s, v1.8h\n"
952 "movi v1.8h, #0x0\n"
953 "uadalp v29.4s, v30.8h\n"
954 "movi v30.8h, #0x0\n"
955 "uadalp v26.4s, v27.8h\n"
956 "movi v27.8h, #0x0\n"
957 "uadalp v23.4s, v24.8h\n"
958 "movi v24.8h, #0x0\n"
959 "uadalp v20.4s, v21.8h\n"
960 "movi v21.8h, #0x0\n"
961 "uadalp v17.4s, v18.8h\n"
962 "movi v18.8h, #0x0\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000963 "mov x9, #0x0\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000964 "91:" // Height 6: Multiply loop: unique 6: no collapse
965 "uadalp v1.8h, v31.16b\n"
966 "uadalp v30.8h, v28.16b\n"
967 "uadalp v27.8h, v25.16b\n"
968 "uadalp v24.8h, v22.16b\n"
969 "uadalp v21.8h, v19.16b\n"
970 "uadalp v18.8h, v16.16b\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000971 "add x9, x9, #0x1\n"
972 "sub x27, x27, #0x10\n"
973 "cmp x27, #0x20\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000974 "bge 90b\n"
975 "92:" // Height 6: Multiply loop: Single iteration only
Michael Tylerbe13cea2023-01-17 11:04:14 +0000976 "sub x27, x27, #0x10\n"
977 "ldr q31, [x26, #0x0]\n"
978 "ldr q28, [x25, #0x0]\n"
979 "ldr q25, [x24, #0x0]\n"
980 "ldr q22, [x23, #0x0]\n"
981 "ldr q19, [x22, #0x0]\n"
982 "ldr q16, [x20, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000983 "uadalp v1.8h, v31.16b\n"
984 "uadalp v30.8h, v28.16b\n"
985 "uadalp v27.8h, v25.16b\n"
986 "uadalp v24.8h, v22.16b\n"
987 "uadalp v21.8h, v19.16b\n"
988 "uadalp v18.8h, v16.16b\n"
989 "add x26, x26, #0x10\n"
990 "add x25, x25, #0x10\n"
991 "add x24, x24, #0x10\n"
992 "add x23, x23, #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +0000993 "add x22, x22, #0x10\n"
994 "add x20, x20, #0x10\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000995 "93:" // Height 6: Multiply loop: Main loop skip
Michael Tylerbe13cea2023-01-17 11:04:14 +0000996 "cbz x27, 102f\n"
997 "tbz x27, #3, 97f\n"
998 "ldr d31, [x26], #0x8\n"
999 "ldr d28, [x25], #0x8\n"
1000 "ldr d25, [x24], #0x8\n"
1001 "ldr d22, [x23], #0x8\n"
1002 "ldr d19, [x22], #0x8\n"
1003 "ldr d16, [x20], #0x8\n"
1004 "tbz x27, #2, 95f\n"
1005 "ld1 { v31.s }[2], [x26], #0x4\n"
1006 "ld1 { v28.s }[2], [x25], #0x4\n"
1007 "ld1 { v25.s }[2], [x24], #0x4\n"
1008 "ld1 { v22.s }[2], [x23], #0x4\n"
1009 "ld1 { v19.s }[2], [x22], #0x4\n"
1010 "ld1 { v16.s }[2], [x20], #0x4\n"
1011 "tbz x27, #1, 94f\n"
1012 "ld1 { v31.h }[6], [x26], #0x2\n"
1013 "ld1 { v28.h }[6], [x25], #0x2\n"
1014 "ld1 { v25.h }[6], [x24], #0x2\n"
1015 "ld1 { v22.h }[6], [x23], #0x2\n"
1016 "ld1 { v19.h }[6], [x22], #0x2\n"
1017 "ld1 { v16.h }[6], [x20], #0x2\n"
1018 "tbz x27, #0, 101f\n"
1019 "ld1 { v31.b }[14], [x26]\n"
1020 "ld1 { v28.b }[14], [x25]\n"
1021 "ld1 { v25.b }[14], [x24]\n"
1022 "ld1 { v22.b }[14], [x23]\n"
1023 "ld1 { v19.b }[14], [x22]\n"
1024 "ld1 { v16.b }[14], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001025 "b 101f\n"
1026 "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
Michael Tylerbe13cea2023-01-17 11:04:14 +00001027 "tbz x27, #0, 101f\n"
1028 "ld1 { v31.b }[12], [x26]\n"
1029 "ld1 { v28.b }[12], [x25]\n"
1030 "ld1 { v25.b }[12], [x24]\n"
1031 "ld1 { v22.b }[12], [x23]\n"
1032 "ld1 { v19.b }[12], [x22]\n"
1033 "ld1 { v16.b }[12], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001034 "b 101f\n"
1035 "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
Michael Tylerbe13cea2023-01-17 11:04:14 +00001036 "tbz x27, #1, 96f\n"
1037 "ld1 { v31.h }[4], [x26], #0x2\n"
1038 "ld1 { v28.h }[4], [x25], #0x2\n"
1039 "ld1 { v25.h }[4], [x24], #0x2\n"
1040 "ld1 { v22.h }[4], [x23], #0x2\n"
1041 "ld1 { v19.h }[4], [x22], #0x2\n"
1042 "ld1 { v16.h }[4], [x20], #0x2\n"
1043 "tbz x27, #0, 101f\n"
1044 "ld1 { v31.b }[10], [x26]\n"
1045 "ld1 { v28.b }[10], [x25]\n"
1046 "ld1 { v25.b }[10], [x24]\n"
1047 "ld1 { v22.b }[10], [x23]\n"
1048 "ld1 { v19.b }[10], [x22]\n"
1049 "ld1 { v16.b }[10], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001050 "b 101f\n"
1051 "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
Michael Tylerbe13cea2023-01-17 11:04:14 +00001052 "tbz x27, #0, 101f\n"
1053 "ld1 { v31.b }[8], [x26]\n"
1054 "ld1 { v28.b }[8], [x25]\n"
1055 "ld1 { v25.b }[8], [x24]\n"
1056 "ld1 { v22.b }[8], [x23]\n"
1057 "ld1 { v19.b }[8], [x22]\n"
1058 "ld1 { v16.b }[8], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001059 "b 101f\n"
1060 "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
Michael Tylerbe13cea2023-01-17 11:04:14 +00001061 "tbz x27, #2, 99f\n"
1062 "ldr s31, [x26], #0x4\n"
1063 "ldr s28, [x25], #0x4\n"
1064 "ldr s25, [x24], #0x4\n"
1065 "ldr s22, [x23], #0x4\n"
1066 "ldr s19, [x22], #0x4\n"
1067 "ldr s16, [x20], #0x4\n"
1068 "tbz x27, #1, 98f\n"
1069 "ld1 { v31.h }[2], [x26], #0x2\n"
1070 "ld1 { v28.h }[2], [x25], #0x2\n"
1071 "ld1 { v25.h }[2], [x24], #0x2\n"
1072 "ld1 { v22.h }[2], [x23], #0x2\n"
1073 "ld1 { v19.h }[2], [x22], #0x2\n"
1074 "ld1 { v16.h }[2], [x20], #0x2\n"
1075 "tbz x27, #0, 101f\n"
1076 "ld1 { v31.b }[6], [x26]\n"
1077 "ld1 { v28.b }[6], [x25]\n"
1078 "ld1 { v25.b }[6], [x24]\n"
1079 "ld1 { v22.b }[6], [x23]\n"
1080 "ld1 { v19.b }[6], [x22]\n"
1081 "ld1 { v16.b }[6], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001082 "b 101f\n"
1083 "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
Michael Tylerbe13cea2023-01-17 11:04:14 +00001084 "tbz x27, #0, 101f\n"
1085 "ld1 { v31.b }[4], [x26]\n"
1086 "ld1 { v28.b }[4], [x25]\n"
1087 "ld1 { v25.b }[4], [x24]\n"
1088 "ld1 { v22.b }[4], [x23]\n"
1089 "ld1 { v19.b }[4], [x22]\n"
1090 "ld1 { v16.b }[4], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001091 "b 101f\n"
1092 "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
Michael Tylerbe13cea2023-01-17 11:04:14 +00001093 "tbz x27, #1, 100f\n"
1094 "ldr h31, [x26], #0x2\n"
1095 "ldr h28, [x25], #0x2\n"
1096 "ldr h25, [x24], #0x2\n"
1097 "ldr h22, [x23], #0x2\n"
1098 "ldr h19, [x22], #0x2\n"
1099 "ldr h16, [x20], #0x2\n"
1100 "tbz x27, #0, 101f\n"
1101 "ld1 { v31.b }[2], [x26]\n"
1102 "ld1 { v28.b }[2], [x25]\n"
1103 "ld1 { v25.b }[2], [x24]\n"
1104 "ld1 { v22.b }[2], [x23]\n"
1105 "ld1 { v19.b }[2], [x22]\n"
1106 "ld1 { v16.b }[2], [x20]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001107 "b 101f\n"
1108 "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
Michael Tylerbe13cea2023-01-17 11:04:14 +00001109 "ldr b31, [x26, #0x0]\n"
1110 "ldr b28, [x25, #0x0]\n"
1111 "ldr b25, [x24, #0x0]\n"
1112 "ldr b22, [x23, #0x0]\n"
1113 "ldr b19, [x22, #0x0]\n"
1114 "ldr b16, [x20, #0x0]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001115 "101:" // Height 6: Multiply loop: Ragged operand read: Done
1116 "uadalp v1.8h, v31.16b\n"
1117 "uadalp v30.8h, v28.16b\n"
1118 "uadalp v27.8h, v25.16b\n"
1119 "uadalp v24.8h, v22.16b\n"
1120 "uadalp v21.8h, v19.16b\n"
1121 "uadalp v18.8h, v16.16b\n"
1122 "102:" // Height 6: Multiply loop: No odd multiplies
Michael Tylerbe13cea2023-01-17 11:04:14 +00001123 "add x28, x28, #0x1\n"
1124 "cmp x28, x21\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001125 "bne 87b\n"
1126 "uadalp v0.4s, v1.8h\n"
1127 "uadalp v29.4s, v30.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +00001128 "addp v0.4s, v0.4s, v29.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001129 "uadalp v26.4s, v27.8h\n"
1130 "uadalp v23.4s, v24.8h\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +00001131 "addp v29.4s, v26.4s, v23.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001132 "uadalp v20.4s, v21.8h\n"
1133 "uadalp v17.4s, v18.8h\n"
1134 "addp v0.4s, v0.4s, v29.4s\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +00001135 "subs %x[M], %x[M], #0x6\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001136 "addp v20.4s, v20.4s, v17.4s\n"
1137 "mul v0.4s, v0.4s, v2.4s\n"
1138 "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
Michael Tylerbe13cea2023-01-17 11:04:14 +00001139 "addp v20.4s, v20.4s, v20.4s\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001140 "mul v20.4s, v20.4s, v2.4s\n"
1141 "str d20, [%x[out_ptr]], #0x8\n"
1142 "beq 104f\n"
1143 "tbz %x[flags], #3, 103f\n"
1144 "add %x[input_offset], %x[input_offset], #0x6\n"
1145 "b 1b\n"
1146 "103:" // Update direct input
Michael Tylerbe13cea2023-01-17 11:04:14 +00001147 "mov x19, #0x6\n"
1148 "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001149 "b 1b\n"
1150 "104:" // Exit
1151
Michael Tylerbe13cea2023-01-17 11:04:14 +00001152 : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001153 : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
Michael Tylerbe13cea2023-01-17 11:04:14 +00001154 : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001155 );
1156}
1157
1158} // namespace arm_gemm
1159
1160#endif // __aarch64__