blob: 91870e2e5437f4f4bda4f547311701ebdd64cecf [file] [log] [blame]
/*
 * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#ifdef __aarch64__
25
26#include <algorithm>
27
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010028#include "arm_gemm.hpp"
Georgios Pinitas1d480652019-01-23 11:24:50 +000029#include <cstdint>
30#include "../../asmlib.hpp"
31#include "../../utils.hpp"
32
33namespace arm_gemm {
34
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010035void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) {
Georgios Pinitas1d480652019-01-23 11:24:50 +000036 const int K_stride = ((K + 3) / 4) * 4;
37 const long loops_count = ((K + 16) / 32) - 1;
38 K -= loops_count * 32;
39 const long regs_count = (K / 16) - 1;
Georgios Pinitas14613832019-03-01 19:07:11 +000040 K -= (regs_count + 1) * 16;
41 const long blocks_count = K / 4;
42 const long odds_count = K - (blocks_count * 4);
Georgios Pinitas1d480652019-01-23 11:24:50 +000043
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010044 int rows_to_compute;
45
46 for (int y=0; y<M; y+=rows_to_compute) {
Georgios Pinitas1d480652019-01-23 11:24:50 +000047 const uint8_t * const a_ptr0_base = A + (y * lda);
48 const unsigned long ldab = lda * sizeof(uint8_t);
49
50 uint32_t *c_ptr0 = C + (y * ldc);
Georgios Pinitas1d480652019-01-23 11:24:50 +000051
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010052 rows_to_compute = M-y;
53 if (rows_to_compute > 4) {
54 if (rows_to_compute % 4) {
55 rows_to_compute = 4 - 1;
56 } else {
57 rows_to_compute = 4;
58 }
59 }
60
Georgios Pinitas1d480652019-01-23 11:24:50 +000061 for (int x0=0; x0<N; x0+=16ul) {
62 const long width = std::min((unsigned long)N-x0, 16ul);
Georgios Pinitas1d480652019-01-23 11:24:50 +000063 long loops = loops_count;
64 long regs = regs_count;
Georgios Pinitas14613832019-03-01 19:07:11 +000065 long blocks = blocks_count;
66 long odds = odds_count;
Georgios Pinitas1d480652019-01-23 11:24:50 +000067 const uint8_t *a_ptr0 = a_ptr0_base;
68 const uint8_t *b_ptr0 = B + (K_stride * x0);
Georgios Pinitas14613832019-03-01 19:07:11 +000069 const bool use_result_buffer = (width < 16);
70 uint32_t result_buffer[64];
71 const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
72 uint32_t *c_ptr_real = c_ptr0;
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010073 if (use_result_buffer && append) {
Georgios Pinitas14613832019-03-01 19:07:11 +000074 for(int cy=0; cy<std::min(M-y, 4); cy++) {
75 for(unsigned int cx=0; cx<width; cx++) {
76 result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
77 }
78 }
79 }
80 if (use_result_buffer) {
81 c_ptr0 = result_buffer;
82 }
Georgios Pinitas1d480652019-01-23 11:24:50 +000083
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010084 switch(rows_to_compute) {
Georgios Pinitas1d480652019-01-23 11:24:50 +000085 case 1:
86 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010087 "temploadreg0 .req X0\n"
88 "temploadreg1 .req X1\n"
89 "temploadreg2 .req X2\n"
90 "temploadreg3 .req X3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010091 "cbnz %[append], 1f\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010092 "movi v16.4s, #0\n"
93 "ldr q0, [%[a_ptr0]]\n"
94 "movi v17.4s, #0\n"
95 "ldr q8, [%[b_ptr0]]\n"
96 "movi v18.4s, #0\n"
97 "ldr q9, [%[b_ptr0], #0x10]\n"
98 "movi v19.4s, #0\n"
99 "ldr q10, [%[b_ptr0], #0x20]\n"
100 "ldr q11, [%[b_ptr0], #0x30]\n"
101 "add %[a_ptr0], %[a_ptr0], #0x10\n"
102 "ldr q12, [%[b_ptr0], #0x40]\n"
103 "ldr q13, [%[b_ptr0], #0x50]\n"
104 "ldr d14, [%[b_ptr0], #0x60]\n"
105 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
106 "add %[b_ptr0], %[b_ptr0], #0x80\n"
107 "cbz %[loops], 2f\n"
108 "b 3f\n"
109 "1:\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100110 "ldr q16, [%[c_ptr0]]\n"
111 "ldr q17, [%[c_ptr0], #0x10]\n"
112 "ldr q18, [%[c_ptr0], #0x20]\n"
113 "ldr q19, [%[c_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100114 "ldr q0, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100115 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100116 "ldr q8, [%[b_ptr0]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100117 "ldr q9, [%[b_ptr0], #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100118 "ldr q10, [%[b_ptr0], #0x20]\n"
119 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100120 "ldr q12, [%[b_ptr0], #0x40]\n"
121 "ldr q13, [%[b_ptr0], #0x50]\n"
122 "ldr d14, [%[b_ptr0], #0x60]\n"
123 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
124 "add %[b_ptr0], %[b_ptr0], #0x80\n"
125 "cbz %[loops], 2f\n"
126 "3:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100127 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100128 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100129 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100130 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100131 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100132 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100133 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100134 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100135 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100136 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100137 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100138 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100139 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100140 "ldr d9, [%[b_ptr0], #0x10]\n"
141 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
142 "subs %[loops], %[loops], #0x1\n"
143 "ins v4.d[1], temploadreg0\n"
144 "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
145 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
146 "add %[a_ptr0], %[a_ptr0], #0x20\n"
147 "ldr d10, [%[b_ptr0], #0x20]\n"
148 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
149 "ldr d11, [%[b_ptr0], #0x30]\n"
150 "ins v15.d[1], temploadreg3\n"
151 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
152 "ldr d12, [%[b_ptr0], #0x40]\n"
153 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100154 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100155 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
156 "ldr d13, [%[b_ptr0], #0x50]\n"
157 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100158 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100159 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
160 "ldr d14, [%[b_ptr0], #0x60]\n"
161 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100162 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100163 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
164 "ldr d15, [%[b_ptr0], #0x70]\n"
165 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100166 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100167 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
168 "ins v12.d[1], temploadreg0\n"
169 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100170 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100171 "ldr d8, [%[b_ptr0], #-0x80]\n"
172 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100173 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100174 "ldr d9, [%[b_ptr0], #-0x70]\n"
175 "ins v13.d[1], temploadreg1\n"
176 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
177 "ldr d10, [%[b_ptr0], #-0x60]\n"
178 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100179 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100180 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
181 "ldr d11, [%[b_ptr0], #-0x50]\n"
182 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100183 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100184 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
185 "ldr d12, [%[b_ptr0], #-0x40]\n"
186 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100187 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100188 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
189 "ldr d13, [%[b_ptr0], #-0x30]\n"
190 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100191 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100192 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
193 "ldr d14, [%[b_ptr0], #-0x20]\n"
194 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100195 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100196 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
197 "ldr d15, [%[b_ptr0], #-0x10]\n"
198 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100199 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100200 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
201 "ldr d0, [%[a_ptr0], #-0x10]\n"
202 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100203 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100204 "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
205 "ldr d8, [%[b_ptr0]]\n"
206 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100207 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100208 "ins v0.d[1], temploadreg0\n"
209 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
210 "ins v13.d[1], temploadreg1\n"
211 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
212 "ldr d10, [%[b_ptr0], #0x20]\n"
213 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100214 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100215 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
216 "ldr d11, [%[b_ptr0], #0x30]\n"
217 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100218 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100219 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
220 "ldr d12, [%[b_ptr0], #0x40]\n"
221 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100222 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100223 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
224 "ldr d13, [%[b_ptr0], #0x50]\n"
225 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100226 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100227 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
228 "ldr d14, [%[b_ptr0], #0x60]\n"
229 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100230 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100231 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
232 "ldr d15, [%[b_ptr0], #0x70]\n"
233 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100234 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100235 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
236 "ins v12.d[1], temploadreg0\n"
237 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100238 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100239 "ldr d8, [%[b_ptr0], #-0x80]\n"
240 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100241 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100242 "ldr d9, [%[b_ptr0], #-0x70]\n"
243 "ins v13.d[1], temploadreg1\n"
244 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
245 "ldr d10, [%[b_ptr0], #-0x60]\n"
246 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100247 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100248 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
249 "ldr d11, [%[b_ptr0], #-0x50]\n"
250 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100251 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100252 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
253 "ldr d12, [%[b_ptr0], #-0x40]\n"
254 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100255 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100256 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
257 "ldr d13, [%[b_ptr0], #-0x30]\n"
258 "ins v9.d[1], temploadreg1\n"
259 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
260 "ldr d14, [%[b_ptr0], #-0x20]\n"
261 "ins v10.d[1], temploadreg2\n"
262 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
263 "ins v11.d[1], temploadreg3\n"
264 "ins v12.d[1], temploadreg0\n"
265 "ins v13.d[1], temploadreg1\n"
266 "b.ne 3b\n"
267 "2:\n"
268 "ins v14.d[1], temploadreg2\n"
269 "prfm PSTL1KEEP, [%[c_ptr0]]\n"
270 "ldr d15, [%[b_ptr0], #-0x10]\n"
271 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
272 "ins v15.d[1], temploadreg3\n"
273 "cbz %[regs], 4f\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100274 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100275 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100276 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100277 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100278 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100279 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100280 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100281 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100282 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100283 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100284 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100285 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100286 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100287 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100288 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100289 "ldr d10, [%[b_ptr0], #0x20]\n"
290 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
291 "add %[a_ptr0], %[a_ptr0], #0x10\n"
292 "ldr d11, [%[b_ptr0], #0x30]\n"
293 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
294 "ldr d12, [%[b_ptr0], #0x40]\n"
295 "ins v8.d[1], temploadreg0\n"
296 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
297 "ldr d13, [%[b_ptr0], #0x50]\n"
298 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100299 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100300 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
301 "ldr d14, [%[b_ptr0], #0x60]\n"
302 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100303 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100304 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
305 "ldr d15, [%[b_ptr0], #0x70]\n"
306 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100307 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100308 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
309 "ins v12.d[1], temploadreg0\n"
310 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100311 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100312 "ldr d8, [%[b_ptr0], #-0x80]\n"
313 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100314 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100315 "ldr d9, [%[b_ptr0], #-0x70]\n"
316 "ins v13.d[1], temploadreg1\n"
317 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
318 "ldr d10, [%[b_ptr0], #-0x60]\n"
319 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100320 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100321 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
322 "ldr d11, [%[b_ptr0], #-0x50]\n"
323 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100324 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100325 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
326 "ldr d12, [%[b_ptr0], #-0x40]\n"
327 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100328 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100329 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
330 "ldr d13, [%[b_ptr0], #-0x30]\n"
331 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100332 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100333 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
334 "ldr d14, [%[b_ptr0], #-0x20]\n"
335 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100336 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100337 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
338 "ldr d15, [%[b_ptr0], #-0x10]\n"
339 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100340 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100341 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
342 "ldr d8, [%[b_ptr0]]\n"
343 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100344 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100345 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
346 "ldr d9, [%[b_ptr0], #0x10]\n"
347 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100348 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100349 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
350 "ldr d10, [%[b_ptr0], #0x20]\n"
351 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100352 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100353 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
354 "ldr d11, [%[b_ptr0], #0x30]\n"
355 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100356 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100357 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
358 "ldr d12, [%[b_ptr0], #0x40]\n"
359 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100360 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100361 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
362 "ldr d13, [%[b_ptr0], #0x50]\n"
363 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100364 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100365 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
366 "ldr d14, [%[b_ptr0], #0x60]\n"
367 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100368 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100369 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
370 "ldr d15, [%[b_ptr0], #0x70]\n"
371 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100372 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100373 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
374 "ins v12.d[1], temploadreg0\n"
375 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100376 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100377 "ins v13.d[1], temploadreg1\n"
378 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100379 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100380 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100381 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
382 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
383 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100384 "b 5f\n"
385 "4:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100386 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100387 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100388 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100389 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100390 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100391 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100392 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100393 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100394 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100395 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100396 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100397 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100398 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100399 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100400 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100401 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
402 "ldr d12, [%[b_ptr0], #0x40]\n"
403 "ins v8.d[1], temploadreg0\n"
404 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
405 "ldr d13, [%[b_ptr0], #0x50]\n"
406 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100407 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100408 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
409 "ldr d14, [%[b_ptr0], #0x60]\n"
410 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100411 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100412 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
413 "ldr d15, [%[b_ptr0], #0x70]\n"
414 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100415 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100416 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
417 "ins v12.d[1], temploadreg0\n"
418 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100419 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100420 "ins v13.d[1], temploadreg1\n"
421 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100422 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100423 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100424 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
425 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
426 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100427 "5:\n"
428 "cbz %[blocks], 6f\n"
429 "7:\n"
430 "ldr q8, [%[b_ptr0]]\n"
431 "subs %[blocks], %[blocks], #0x1\n"
432 "ldr q9, [%[b_ptr0], #0x10]\n"
433 "ldr s0, [%[a_ptr0]]\n"
434 "ldr q10, [%[b_ptr0], #0x20]\n"
435 "add %[a_ptr0], %[a_ptr0], #0x4\n"
436 "ldr q11, [%[b_ptr0], #0x30]\n"
437 "add %[b_ptr0], %[b_ptr0], #0x40\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100438 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
439 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
440 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
441 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100442 "b.ne 7b\n"
443 "6:\n"
444 "cbz %[odds], 8f\n"
445 "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
446 "subs %[odds], %[odds], #0x1\n"
447 "b.eq 9f\n"
448 "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
449 "subs %[odds], %[odds], #0x1\n"
450 "b.eq 9f\n"
451 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
452 "9:\n"
453 "ldr q8, [%[b_ptr0]]\n"
454 "ldr q9, [%[b_ptr0], #0x10]\n"
455 "ldr q10, [%[b_ptr0], #0x20]\n"
456 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100457 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
458 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
459 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
460 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100461 "8:\n"
462 "str q16, [%[c_ptr0]]\n"
463 "str q17, [%[c_ptr0], #0x10]\n"
464 "str q18, [%[c_ptr0], #0x20]\n"
465 "str q19, [%[c_ptr0], #0x30]\n"
466 "add %[c_ptr0], %[c_ptr0], #0x40\n"
467 ".unreq temploadreg0\n"
468 ".unreq temploadreg1\n"
469 ".unreq temploadreg2\n"
470 ".unreq temploadreg3\n"
471 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100472 : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100473 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
Georgios Pinitas1d480652019-01-23 11:24:50 +0000474 );
475 break;
476 case 2:
477 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100478 "a_ptr1 .req X0\n"
479 "c_ptr1 .req X1\n"
480 "temploadreg0 .req X2\n"
481 "temploadreg1 .req X3\n"
482 "temploadreg2 .req X4\n"
483 "temploadreg3 .req X5\n"
484 "add a_ptr1, %[a_ptr0], %[lda]\n"
485 "add c_ptr1, %[c_ptr0], %[ldc]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100486 "cbnz %[append], 1f\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100487 "movi v16.4s, #0\n"
488 "ldr q0, [%[a_ptr0]]\n"
489 "movi v17.4s, #0\n"
490 "ldr q1, [a_ptr1]\n"
491 "movi v18.4s, #0\n"
492 "ldr q8, [%[b_ptr0]]\n"
493 "movi v19.4s, #0\n"
494 "ldr q9, [%[b_ptr0], #0x10]\n"
495 "movi v20.4s, #0\n"
496 "ldr q10, [%[b_ptr0], #0x20]\n"
497 "movi v21.4s, #0\n"
498 "ldr q11, [%[b_ptr0], #0x30]\n"
499 "movi v22.4s, #0\n"
500 "ldr q12, [%[b_ptr0], #0x40]\n"
501 "movi v23.4s, #0\n"
502 "ldr q13, [%[b_ptr0], #0x50]\n"
503 "ldr d14, [%[b_ptr0], #0x60]\n"
504 "add %[a_ptr0], %[a_ptr0], #0x10\n"
505 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
506 "add a_ptr1, a_ptr1, #0x10\n"
507 "add %[b_ptr0], %[b_ptr0], #0x80\n"
508 "cbz %[loops], 2f\n"
509 "b 3f\n"
510 "1:\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100511 "ldr q16, [%[c_ptr0]]\n"
512 "ldr q17, [%[c_ptr0], #0x10]\n"
513 "ldr q18, [%[c_ptr0], #0x20]\n"
514 "ldr q19, [%[c_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100515 "ldr q20, [c_ptr1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100516 "ldr q21, [c_ptr1, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100517 "ldr q22, [c_ptr1, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100518 "ldr q23, [c_ptr1, #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100519 "ldr q0, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100520 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100521 "ldr q1, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100522 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100523 "ldr q8, [%[b_ptr0]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100524 "ldr q9, [%[b_ptr0], #0x10]\n"
525 "ldr q10, [%[b_ptr0], #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100526 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100527 "ldr q12, [%[b_ptr0], #0x40]\n"
528 "ldr q13, [%[b_ptr0], #0x50]\n"
529 "ldr d14, [%[b_ptr0], #0x60]\n"
530 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
531 "add %[b_ptr0], %[b_ptr0], #0x80\n"
532 "cbz %[loops], 2f\n"
533 "3:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100534 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100535 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100536 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100537 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100538 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100539 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100540 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100541 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100542 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100543 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100544 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100545 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100546 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100547 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100548 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100549 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100550 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100551 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100552 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100553 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100554 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100555 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100556 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100557 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100558 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100559 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100560 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100561 "ldr d10, [%[b_ptr0], #0x20]\n"
562 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
563 "subs %[loops], %[loops], #0x1\n"
564 "ldr d11, [%[b_ptr0], #0x30]\n"
565 "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
566 "ins v15.d[1], temploadreg3\n"
567 "add %[a_ptr0], %[a_ptr0], #0x20\n"
568 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
569 "add a_ptr1, a_ptr1, #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100570 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100571 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100572 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100573 "ins v8.d[1], temploadreg0\n"
574 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
575 "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
576 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100577 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100578 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100579 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100580 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
581 "ldr d14, [%[b_ptr0], #0x60]\n"
582 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100583 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100584 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100585 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100586 "ldr d15, [%[b_ptr0], #0x70]\n"
587 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100588 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100589 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100590 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100591 "ins v12.d[1], temploadreg0\n"
592 "ins v13.d[1], temploadreg1\n"
593 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100594 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100595 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100596 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100597 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100598 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100599 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100600 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100601 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100602 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100603 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100604 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100605 "ins v14.d[1], temploadreg2\n"
606 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
607 "ldr d11, [%[b_ptr0], #-0x50]\n"
608 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100609 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100610 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100611 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100612 "ldr d12, [%[b_ptr0], #-0x40]\n"
613 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100614 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100615 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100616 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100617 "ldr d13, [%[b_ptr0], #-0x30]\n"
618 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100619 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100620 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100621 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100622 "ldr d14, [%[b_ptr0], #-0x20]\n"
623 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100624 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100625 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100626 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100627 "ldr d15, [%[b_ptr0], #-0x10]\n"
628 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100629 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100630 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100631 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100632 "ldr d0, [%[a_ptr0], #-0x10]\n"
633 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100634 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100635 "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100636 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100637 "ldr d1, [a_ptr1, #-0x10]\n"
638 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100639 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100640 "ldr temploadreg1, [a_ptr1, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100641 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100642 "ldr d8, [%[b_ptr0]]\n"
643 "ins v0.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100644 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100645 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100646 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100647 "ldr d9, [%[b_ptr0], #0x10]\n"
648 "ins v1.d[1], temploadreg1\n"
649 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
650 "ldr d10, [%[b_ptr0], #0x20]\n"
651 "ins v14.d[1], temploadreg2\n"
652 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
653 "ldr d11, [%[b_ptr0], #0x30]\n"
654 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100655 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100656 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100657 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100658 "ldr d12, [%[b_ptr0], #0x40]\n"
659 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100660 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100661 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100662 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100663 "ldr d13, [%[b_ptr0], #0x50]\n"
664 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100665 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100666 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100667 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100668 "ldr d14, [%[b_ptr0], #0x60]\n"
669 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100670 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100671 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100672 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100673 "ldr d15, [%[b_ptr0], #0x70]\n"
674 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100675 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100676 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100677 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100678 "ins v12.d[1], temploadreg0\n"
679 "ins v13.d[1], temploadreg1\n"
680 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100681 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100682 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100683 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100684 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100685 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100686 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100687 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100688 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100689 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100690 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100691 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100692 "ins v14.d[1], temploadreg2\n"
693 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
694 "ldr d11, [%[b_ptr0], #-0x50]\n"
695 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100696 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100697 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100698 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100699 "ldr d12, [%[b_ptr0], #-0x40]\n"
700 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100701 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100702 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100703 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100704 "ldr d13, [%[b_ptr0], #-0x30]\n"
705 "ins v9.d[1], temploadreg1\n"
706 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
707 "ldr d14, [%[b_ptr0], #-0x20]\n"
708 "ins v10.d[1], temploadreg2\n"
709 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
710 "ins v11.d[1], temploadreg3\n"
711 "ins v12.d[1], temploadreg0\n"
712 "ins v13.d[1], temploadreg1\n"
713 "b.ne 3b\n"
714 "2:\n"
715 "ins v14.d[1], temploadreg2\n"
716 "prfm PSTL1KEEP, [%[c_ptr0]]\n"
717 "ldr d15, [%[b_ptr0], #-0x10]\n"
718 "prfm PSTL1KEEP, [c_ptr1]\n"
719 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
720 "ins v15.d[1], temploadreg3\n"
721 "cbz %[regs], 4f\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100722 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100723 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100724 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100725 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100726 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100727 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100728 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100729 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100730 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100731 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100732 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100733 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100734 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100735 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100736 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100737 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100738 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100739 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100740 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100741 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100742 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100743 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100744 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100745 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100746 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100747 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100748 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100749 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100750 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100751 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100752 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100753 "ins v8.d[1], temploadreg0\n"
754 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
755 "add %[a_ptr0], %[a_ptr0], #0x10\n"
756 "ldr d13, [%[b_ptr0], #0x50]\n"
757 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100758 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100759 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100760 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100761 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
762 "ldr d14, [%[b_ptr0], #0x60]\n"
763 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100764 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100765 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100766 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100767 "ldr d15, [%[b_ptr0], #0x70]\n"
768 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100769 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100770 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100771 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100772 "ins v12.d[1], temploadreg0\n"
773 "ins v13.d[1], temploadreg1\n"
774 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100775 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100776 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100777 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100778 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100779 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100780 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100781 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100782 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100783 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100784 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100785 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100786 "ins v14.d[1], temploadreg2\n"
787 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
788 "ldr d11, [%[b_ptr0], #-0x50]\n"
789 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100790 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100791 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100792 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100793 "ldr d12, [%[b_ptr0], #-0x40]\n"
794 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100795 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100796 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100797 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100798 "ldr d13, [%[b_ptr0], #-0x30]\n"
799 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100800 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100801 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100802 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100803 "ldr d14, [%[b_ptr0], #-0x20]\n"
804 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100805 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100806 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100807 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100808 "ldr d15, [%[b_ptr0], #-0x10]\n"
809 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100810 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100811 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100812 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100813 "ldr d8, [%[b_ptr0]]\n"
814 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100815 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100816 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100817 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100818 "ldr d9, [%[b_ptr0], #0x10]\n"
819 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100820 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100821 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100822 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100823 "ldr d10, [%[b_ptr0], #0x20]\n"
824 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100825 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100826 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100827 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100828 "ldr d11, [%[b_ptr0], #0x30]\n"
829 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100830 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100831 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100832 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100833 "ldr d12, [%[b_ptr0], #0x40]\n"
834 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100835 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100836 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100837 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100838 "ldr d13, [%[b_ptr0], #0x50]\n"
839 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100840 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100841 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100842 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100843 "ldr d14, [%[b_ptr0], #0x60]\n"
844 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100845 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100846 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100847 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100848 "ldr d15, [%[b_ptr0], #0x70]\n"
849 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100850 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100851 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100852 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100853 "ins v12.d[1], temploadreg0\n"
854 "ins v13.d[1], temploadreg1\n"
855 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100856 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100857 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100858 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100859 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100860 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
861 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
862 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
863 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
864 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
865 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
866 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
867 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100868 "b 5f\n"
869 "4:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100870 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100871 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100872 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100873 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100874 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100875 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100876 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100877 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100878 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100879 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100880 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100881 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100882 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100883 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100884 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100885 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100886 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100887 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100888 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100889 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100890 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100891 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100892 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100893 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100894 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100895 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100896 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100897 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100898 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100899 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100900 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100901 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100902 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100903 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100904 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100905 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100906 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100907 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100908 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100909 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100910 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100911 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100912 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100913 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100914 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100915 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100916 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100917 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100918 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100919 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100920 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
921 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
922 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
923 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
924 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
925 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
926 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100927 "5:\n"
928 "cbz %[blocks], 6f\n"
929 "7:\n"
930 "ldr q8, [%[b_ptr0]]\n"
931 "subs %[blocks], %[blocks], #0x1\n"
932 "ldr q9, [%[b_ptr0], #0x10]\n"
933 "ldr s0, [%[a_ptr0]]\n"
934 "ldr q10, [%[b_ptr0], #0x20]\n"
935 "add %[a_ptr0], %[a_ptr0], #0x4\n"
936 "ldr q11, [%[b_ptr0], #0x30]\n"
937 "add %[b_ptr0], %[b_ptr0], #0x40\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100938 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100939 "ldr s1, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100940 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100941 "add a_ptr1, a_ptr1, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100942 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
943 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
944 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
945 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
946 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
947 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100948 "b.ne 7b\n"
949 "6:\n"
950 "cbz %[odds], 8f\n"
951 "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
952 "ld1 {v1.b}[0], [a_ptr1], #1\n"
953 "subs %[odds], %[odds], #0x1\n"
954 "b.eq 9f\n"
955 "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
956 "ld1 {v1.b}[1], [a_ptr1], #1\n"
957 "subs %[odds], %[odds], #0x1\n"
958 "b.eq 9f\n"
959 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
960 "ld1 {v1.b}[2], [a_ptr1]\n"
961 "9:\n"
962 "ldr q8, [%[b_ptr0]]\n"
963 "ldr q9, [%[b_ptr0], #0x10]\n"
964 "ldr q10, [%[b_ptr0], #0x20]\n"
965 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100966 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
967 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
968 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
969 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
970 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
971 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
972 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
973 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100974 "8:\n"
975 "str q16, [%[c_ptr0]]\n"
976 "str q17, [%[c_ptr0], #0x10]\n"
977 "str q18, [%[c_ptr0], #0x20]\n"
978 "str q19, [%[c_ptr0], #0x30]\n"
979 "add %[c_ptr0], %[c_ptr0], #0x40\n"
980 "str q20, [c_ptr1]\n"
981 "str q21, [c_ptr1, #0x10]\n"
982 "str q22, [c_ptr1, #0x20]\n"
983 "str q23, [c_ptr1, #0x30]\n"
984 ".unreq a_ptr1\n"
985 ".unreq c_ptr1\n"
986 ".unreq temploadreg0\n"
987 ".unreq temploadreg1\n"
988 ".unreq temploadreg2\n"
989 ".unreq temploadreg3\n"
990 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100991 : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100992 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
Georgios Pinitas1d480652019-01-23 11:24:50 +0000993 );
994 break;
995 case 3:
996 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100997 "a_ptr1 .req X0\n"
998 "a_ptr2 .req X1\n"
999 "c_ptr1 .req X2\n"
1000 "c_ptr2 .req X3\n"
1001 "temploadreg0 .req X4\n"
1002 "temploadreg1 .req X5\n"
1003 "temploadreg2 .req X6\n"
1004 "temploadreg3 .req X7\n"
1005 "add a_ptr1, %[a_ptr0], %[lda]\n"
1006 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1007 "add a_ptr2, a_ptr1, %[lda]\n"
1008 "add c_ptr2, c_ptr1, %[ldc]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001009 "cbnz %[append], 1f\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001010 "movi v16.4s, #0\n"
1011 "ldr q0, [%[a_ptr0]]\n"
1012 "movi v17.4s, #0\n"
1013 "ldr q1, [a_ptr1]\n"
1014 "movi v18.4s, #0\n"
1015 "ldr q2, [a_ptr2]\n"
1016 "movi v19.4s, #0\n"
1017 "ldr q8, [%[b_ptr0]]\n"
1018 "movi v20.4s, #0\n"
1019 "ldr q9, [%[b_ptr0], #0x10]\n"
1020 "movi v21.4s, #0\n"
1021 "ldr q10, [%[b_ptr0], #0x20]\n"
1022 "movi v22.4s, #0\n"
1023 "ldr q11, [%[b_ptr0], #0x30]\n"
1024 "movi v23.4s, #0\n"
1025 "ldr q12, [%[b_ptr0], #0x40]\n"
1026 "movi v24.4s, #0\n"
1027 "ldr q13, [%[b_ptr0], #0x50]\n"
1028 "movi v25.4s, #0\n"
1029 "ldr d14, [%[b_ptr0], #0x60]\n"
1030 "movi v26.4s, #0\n"
1031 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
1032 "movi v27.4s, #0\n"
1033 "add %[a_ptr0], %[a_ptr0], #0x10\n"
1034 "add a_ptr1, a_ptr1, #0x10\n"
1035 "ins v14.d[1], temploadreg2\n"
1036 "add a_ptr2, a_ptr2, #0x10\n"
1037 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1038 "cbz %[loops], 2f\n"
1039 "b 3f\n"
1040 "1:\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001041 "ldr q16, [%[c_ptr0]]\n"
1042 "ldr q17, [%[c_ptr0], #0x10]\n"
1043 "ldr q18, [%[c_ptr0], #0x20]\n"
1044 "ldr q19, [%[c_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001045 "ldr q20, [c_ptr1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001046 "ldr q21, [c_ptr1, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001047 "ldr q22, [c_ptr1, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001048 "ldr q23, [c_ptr1, #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001049 "ldr q24, [c_ptr2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001050 "ldr q25, [c_ptr2, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001051 "ldr q26, [c_ptr2, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001052 "ldr q27, [c_ptr2, #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001053 "ldr q0, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001054 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001055 "ldr q1, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001056 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001057 "ldr q2, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001058 "add a_ptr2, a_ptr2, #0x10\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001059 "ldr q8, [%[b_ptr0]]\n"
1060 "ldr q9, [%[b_ptr0], #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001061 "ldr q10, [%[b_ptr0], #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001062 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001063 "ldr q12, [%[b_ptr0], #0x40]\n"
1064 "ldr q13, [%[b_ptr0], #0x50]\n"
1065 "ldr d14, [%[b_ptr0], #0x60]\n"
1066 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
1067 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1068 "ins v14.d[1], temploadreg2\n"
1069 "cbz %[loops], 2f\n"
1070 "3:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001071 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001072 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001073 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001074 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001075 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001076 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001077 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001078 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001079 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001080 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001081 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001082 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001083 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001084 "ldr d6, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001085 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001086 "ldr temploadreg2, [a_ptr2, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001087 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001088 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001089 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001090 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001091 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001092 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001093 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001094 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001095 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001096 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001097 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001098 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001099 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001100 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001101 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001102 "ins v6.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001103 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001104 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001105 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001106 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001107 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001108 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001109 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001110 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001111 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001112 "ldr d12, [%[b_ptr0], #0x40]\n"
1113 "ins v8.d[1], temploadreg0\n"
1114 "subs %[loops], %[loops], #0x1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001115 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001116 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001117 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001118 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001119 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001120 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001121 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001122 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001123 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001124 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001125 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001126 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001127 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001128 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001129 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001130 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001131 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001132 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001133 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001134 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001135 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001136 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001137 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001138 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001139 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001140 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001141 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001142 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001143 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001144 "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001145 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001146 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001147 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001148 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001149 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001150 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001151 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001152 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001153 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001154 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001155 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001156 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001157 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001158 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001159 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001160 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001161 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001162 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001163 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001164 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001165 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001166 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001167 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001168 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
1169 "ldr d13, [%[b_ptr0], #-0x30]\n"
1170 "add %[a_ptr0], %[a_ptr0], #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001171 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001172 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001173 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001174 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001175 ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001176 "ldr d14, [%[b_ptr0], #-0x20]\n"
1177 "ins v10.d[1], temploadreg2\n"
1178 "add a_ptr1, a_ptr1, #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001179 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001180 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001181 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001182 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001183 ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001184 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001185 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001186 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001187 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001188 "ldr d0, [%[a_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001189 ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001190 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001191 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001192 "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001193 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001194 "ldr d1, [a_ptr1, #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001195 ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001196 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001197 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001198 "ldr temploadreg1, [a_ptr1, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001199 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001200 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001201 ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001202 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001203 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001204 "ins v0.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001205 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001206 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001207 ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001208 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001209 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001210 "ins v1.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001211 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001212 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001213 ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001214 "ldr d10, [%[b_ptr0], #0x20]\n"
1215 "ldr d11, [%[b_ptr0], #0x30]\n"
1216 "add a_ptr2, a_ptr2, #0x20\n"
1217 "ins v15.d[1], temploadreg3\n"
1218 "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
1219 "ldr d2, [a_ptr2, #-0x10]\n"
1220 "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001221 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001222 "ldr temploadreg2, [a_ptr2, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001223 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001224 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001225 ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001226 "ldr d12, [%[b_ptr0], #0x40]\n"
1227 "ins v8.d[1], temploadreg0\n"
1228 "ins v2.d[1], temploadreg2\n"
1229 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
1230 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001231 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001232 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001233 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001234 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001235 ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001236 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
1237 "ldr d14, [%[b_ptr0], #0x60]\n"
1238 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001239 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001240 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001241 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001242 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001243 ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001244 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001245 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001246 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001247 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001248 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001249 ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001250 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001251 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001252 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001253 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001254 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001255 ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001256 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001257 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001258 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001259 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001260 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001261 ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001262 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001263 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001264 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001265 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001266 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001267 ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001268 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001269 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001270 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001271 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001272 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001273 ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001274 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001275 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001276 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001277 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001278 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001279 ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001280 "ldr d13, [%[b_ptr0], #-0x30]\n"
1281 "ins v9.d[1], temploadreg1\n"
1282 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
1283 "ldr d14, [%[b_ptr0], #-0x20]\n"
1284 "ins v10.d[1], temploadreg2\n"
1285 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
1286 "ins v11.d[1], temploadreg3\n"
1287 "ins v12.d[1], temploadreg0\n"
1288 "ins v13.d[1], temploadreg1\n"
1289 "ins v14.d[1], temploadreg2\n"
1290 "b.ne 3b\n"
1291 "2:\n"
1292 "ldr d15, [%[b_ptr0], #-0x10]\n"
1293 "prfm PSTL1KEEP, [%[c_ptr0]]\n"
1294 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
1295 "prfm PSTL1KEEP, [c_ptr1]\n"
1296 "prfm PSTL1KEEP, [c_ptr2]\n"
1297 "ins v15.d[1], temploadreg3\n"
1298 "cbz %[regs], 4f\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001299 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001300 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001301 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001302 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001303 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001304 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001305 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001306 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001307 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001308 "ldr d6, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001309 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001310 "ldr temploadreg2, [a_ptr2, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001311 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001312 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001313 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001314 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001315 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001316 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001317 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001318 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001319 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001320 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001321 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001322 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001323 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001324 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001325 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001326 "ins v6.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001327 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001328 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001329 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001330 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001331 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001332 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001333 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001334 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001335 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001336 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001337 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001338 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001339 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001340 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001341 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001342 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001343 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001344 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001345 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001346 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001347 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001348 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001349 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001350 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001351 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001352 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001353 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001354 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001355 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001356 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001357 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001358 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001359 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001360 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001361 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001362 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001363 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001364 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001365 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001366 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001367 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001368 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001369 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001370 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001371 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001372 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001373 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001374 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001375 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001376 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001377 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001378 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001379 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001380 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001381 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001382 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001383 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001384 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001385 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001386 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001387 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001388 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001389 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001390 "ldr d13, [%[b_ptr0], #-0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001391 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001392 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001393 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001394 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001395 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001396 "ldr d14, [%[b_ptr0], #-0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001397 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001398 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001399 ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001400 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001401 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001402 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001403 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001404 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001405 ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001406 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001407 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001408 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001409 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001410 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001411 ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001412 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001413 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001414 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001415 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001416 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001417 ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001418 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001419 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001420 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001421 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001422 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001423 ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001424 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001425 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001426 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001427 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001428 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001429 ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001430 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001431 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001432 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001433 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001434 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001435 ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001436 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001437 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001438 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001439 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001440 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001441 ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001442 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001443 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001444 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001445 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001446 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001447 ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001448 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001449 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001450 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001451 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001452 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001453 ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001454 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001455 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001456 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001457 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001458 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001459 ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001460 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001461 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001462 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001463 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001464 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001465 ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001466 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001467 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001468 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001469 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001470 "add a_ptr2, a_ptr2, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001471 ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
1472 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
1473 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
1474 ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
1475 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
1476 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
1477 ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
1478 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
1479 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
1480 ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001481 "b 5f\n"
// Label 4: not reachable by fall-through (the "b 5f" directly above jumps past
// it), so it is entered only by a branch elsewhere in this asm statement.
// Consumes all four 4-byte groups ([0]..[3]) of the A vectors v0/v1/v2 against
// the B vectors v8-v15 and accumulates into v16-v27.
//
// While groups [0] and [1] are being multiplied, each B vector is refilled from
// b_ptr0 + 0x00..0x70 as soon as its last use has been issued. Every 128-bit
// refill is done as a 64-bit "ldr d" for the low half plus a 64-bit load into a
// temploadreg that "ins" merges into the high half.
// NOTE(review): presumably a Cortex-A55 scheduling choice (function suffix
// _a55) - confirm.
1482 "4:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001483 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001484 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001485 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001486 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001487 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001488 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001489 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001490 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001491 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001492 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001493 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001494 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001495 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001496 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001497 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001498 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001499 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001500 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001501 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001502 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001503 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001504 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001505 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001506 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001507 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001508 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001509 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001510 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001511 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001512 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001513 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001514 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001515 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001516 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001517 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001518 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001519 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001520 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001521 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
1522 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001523 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001524 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001525 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001526 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
1527 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001528 "ldr d15, [%[b_ptr0], #0x70]\n"
// Groups [2] and [3] below use the B vectors that were just refilled.
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001529 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001530 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001531 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
// Step b_ptr0 past the eight B vectors (0x80 bytes) fetched above. The last
// "ins" after this only moves an already-loaded register, so it is unaffected.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001532 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001533 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001534 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001535 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
1536 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
1537 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
1538 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
1539 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
1540 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
1541 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
1542 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
1543 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
1544 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
1545 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
1546 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
1547 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
1548 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
1549 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
1550 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
1551 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
1552 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
1553 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
1554 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
1555 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
// Label 5: remainder loop over whole 4-byte K groups. Skipped entirely when
// %[blocks] is zero.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001556 "5:\n"
1557 "cbz %[blocks], 6f\n"
// Label 7: one pass per block. Loads 0x40 bytes of B into v8-v11 and one 32-bit
// word from each of the three A row pointers into s0/s1/s2, then accumulates
// the 3 x 4 grid of udot products into v16-v27. Each A pointer advances by
// 4 bytes and b_ptr0 by 0x40 bytes per pass.
1558 "7:\n"
1559 "ldr q8, [%[b_ptr0]]\n"
1560 "subs %[blocks], %[blocks], #0x1\n"
1561 "ldr q9, [%[b_ptr0], #0x10]\n"
1562 "ldr s0, [%[a_ptr0]]\n"
1563 "ldr q10, [%[b_ptr0], #0x20]\n"
1564 "add %[a_ptr0], %[a_ptr0], #0x4\n"
1565 "ldr q11, [%[b_ptr0], #0x30]\n"
1566 "add %[b_ptr0], %[b_ptr0], #0x40\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001567 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001568 "ldr s1, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001569 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001570 "add a_ptr1, a_ptr1, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001571 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001572 "ldr s2, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001573 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001574 "add a_ptr2, a_ptr2, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001575 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
1576 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
1577 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
1578 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
1579 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
1580 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
1581 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
1582 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
// The flags tested here were set by the "subs" at the top of the pass; the
// loads, non-flag-setting adds and udots in between leave them untouched.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001583 "b.ne 7b\n"
// Label 6: trailing 1-3 bytes of K. Skipped when %[odds] is zero.
1584 "6:\n"
1585 "cbz %[odds], 8f\n"
// First leftover byte of each row goes into byte lane 0 of v0/v1/v2, with a
// post-increment of each A pointer.
1586 "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
1587 "ld1 {v1.b}[0], [a_ptr1], #1\n"
1588 "ld1 {v2.b}[0], [a_ptr2], #1\n"
1589 "subs %[odds], %[odds], #0x1\n"
1590 "b.eq 9f\n"
// Second leftover byte -> byte lane 1.
1591 "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
1592 "ld1 {v1.b}[1], [a_ptr1], #1\n"
1593 "ld1 {v2.b}[1], [a_ptr2], #1\n"
1594 "subs %[odds], %[odds], #0x1\n"
1595 "b.eq 9f\n"
// Third leftover byte -> byte lane 2. No post-increment: the A pointers are not
// dereferenced again in the rest of this asm statement.
1596 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
1597 "ld1 {v1.b}[2], [a_ptr1]\n"
1598 "ld1 {v2.b}[2], [a_ptr2]\n"
// Label 9: load one more 0x40 bytes of B into v8-v11 and accumulate the 3 x 4
// grid of udots using 4-byte group [0] of v0/v1/v2. b_ptr0 is not advanced.
// NOTE(review): only the byte lanes written above are fresh; the remaining
// lanes of group [0] keep earlier register contents, so this presumably relies
// on the matching B bytes being zero padding - confirm against the B packing.
1599 "9:\n"
1600 "ldr q8, [%[b_ptr0]]\n"
1601 "ldr q9, [%[b_ptr0], #0x10]\n"
1602 "ldr q10, [%[b_ptr0], #0x20]\n"
1603 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001604 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
1605 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
1606 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
1607 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
1608 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
1609 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
1610 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
1611 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
1612 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
1613 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
1614 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
1615 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
// Label 8: write back the 3-row accumulator tile, four 128-bit vectors (0x40
// bytes) per row: v16-v19 via c_ptr0, v20-v23 via c_ptr1, v24-v27 via c_ptr2.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001616 "8:\n"
1617 "str q16, [%[c_ptr0]]\n"
1618 "str q17, [%[c_ptr0], #0x10]\n"
1619 "str q18, [%[c_ptr0], #0x20]\n"
1620 "str q19, [%[c_ptr0], #0x30]\n"
// Only c_ptr0 is advanced: it is a read-write operand whose final value is
// handed back to the C++ side, whereas c_ptr1/c_ptr2 are assembler aliases
// that are released just below.
1621 "add %[c_ptr0], %[c_ptr0], #0x40\n"
1622 "str q20, [c_ptr1]\n"
1623 "str q21, [c_ptr1, #0x10]\n"
1624 "str q22, [c_ptr1, #0x20]\n"
1625 "str q23, [c_ptr1, #0x30]\n"
1626 "str q24, [c_ptr2]\n"
1627 "str q25, [c_ptr2, #0x10]\n"
1628 "str q26, [c_ptr2, #0x20]\n"
1629 "str q27, [c_ptr2, #0x30]\n"
// Drop the register aliases so the next asm statement (case 4) can bind the
// same names again with .req.
1630 ".unreq a_ptr1\n"
1631 ".unreq a_ptr2\n"
1632 ".unreq c_ptr1\n"
1633 ".unreq c_ptr2\n"
1634 ".unreq temploadreg0\n"
1635 ".unreq temploadreg1\n"
1636 ".unreq temploadreg2\n"
1637 ".unreq temploadreg3\n"
// Read-write operands: the pointers and counters the assembly updates in place.
1638 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
// Read-only inputs. "append" is widened to a 64-bit value before being passed.
// NOTE(review): ldab/ldcb are presumably the row strides in bytes - confirm at
// their definition.
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001639 : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
// Clobbers: every vector register, x0-x7, the condition flags and memory.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001640 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
Georgios Pinitas1d480652019-01-23 11:24:50 +00001641 );
1642 break;
// Four-row variant; also taken for any value not handled by an earlier case
// ("default" shares this body).
1643 default:
1644 case 4:
1645 __asm __volatile (
// Bind assembler aliases to fixed general registers: X0-X2 for the extra A row
// pointers, X3-X5 for the extra C row pointers, X6-X9 as 64-bit staging
// registers for split vector loads.
// NOTE(review): X8 and X9 are used here, so this statement's clobber list
// (outside this view) needs to name x8 and x9 - confirm.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001646 "a_ptr1 .req X0\n"
1647 "a_ptr2 .req X1\n"
1648 "a_ptr3 .req X2\n"
1649 "c_ptr1 .req X3\n"
1650 "c_ptr2 .req X4\n"
1651 "c_ptr3 .req X5\n"
1652 "temploadreg0 .req X6\n"
1653 "temploadreg1 .req X7\n"
1654 "temploadreg2 .req X8\n"
1655 "temploadreg3 .req X9\n"
// Derive the pointers for rows 1-3 by repeatedly adding %[lda] (A) and %[ldc]
// (C) to the row-0 pointers.
1656 "add a_ptr1, %[a_ptr0], %[lda]\n"
1657 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1658 "add a_ptr2, a_ptr1, %[lda]\n"
1659 "add c_ptr2, c_ptr1, %[ldc]\n"
1660 "add a_ptr3, a_ptr2, %[lda]\n"
1661 "add c_ptr3, c_ptr2, %[ldc]\n"
// Non-zero %[append]: jump to label 1, which seeds the accumulators from the
// existing C contents. Otherwise fall through to the zero-initialising path.
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001662 "cbnz %[append], 1f\n"
// Fresh-output path (%[append] == 0): clear the sixteen accumulators v16-v31
// with movi, interleaved with the initial operand loads:
//   - the first 16 bytes of each of the four A rows into q0-q3,
//   - B bytes 0x00-0x6f into v8-v14; v14 is assembled from a 64-bit "ldr d"
//     plus a 64-bit general-register load merged in with "ins".
// B bytes 0x70-0x7f (v15) are not loaded here; label 3 fetches them from
// [b_ptr0, #-0x10] after b_ptr0 has been advanced.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001663 "movi v16.4s, #0\n"
1664 "ldr q0, [%[a_ptr0]]\n"
1665 "movi v17.4s, #0\n"
1666 "ldr q1, [a_ptr1]\n"
1667 "movi v18.4s, #0\n"
1668 "ldr q2, [a_ptr2]\n"
1669 "movi v19.4s, #0\n"
1670 "ldr q3, [a_ptr3]\n"
1671 "movi v20.4s, #0\n"
1672 "ldr q8, [%[b_ptr0]]\n"
1673 "movi v21.4s, #0\n"
1674 "ldr q9, [%[b_ptr0], #0x10]\n"
1675 "movi v22.4s, #0\n"
1676 "ldr q10, [%[b_ptr0], #0x20]\n"
1677 "movi v23.4s, #0\n"
1678 "ldr q11, [%[b_ptr0], #0x30]\n"
1679 "movi v24.4s, #0\n"
1680 "ldr q12, [%[b_ptr0], #0x40]\n"
1681 "movi v25.4s, #0\n"
1682 "ldr q13, [%[b_ptr0], #0x50]\n"
1683 "movi v26.4s, #0\n"
1684 "ldr d14, [%[b_ptr0], #0x60]\n"
1685 "movi v27.4s, #0\n"
1686 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
1687 "movi v28.4s, #0\n"
1688 "add %[a_ptr0], %[a_ptr0], #0x10\n"
1689 "movi v29.4s, #0\n"
1690 "ins v14.d[1], temploadreg2\n"
1691 "movi v30.4s, #0\n"
1692 "add a_ptr1, a_ptr1, #0x10\n"
1693 "movi v31.4s, #0\n"
1694 "add a_ptr2, a_ptr2, #0x10\n"
1695 "add a_ptr3, a_ptr3, #0x10\n"
1696 "add %[b_ptr0], %[b_ptr0], #0x80\n"
// Zero %[loops] goes to label 2 (outside this view); otherwise enter the main
// loop at label 3, jumping over the append-path preload at label 1.
1697 "cbz %[loops], 2f\n"
1698 "b 3f\n"
// Label 1: append path (%[append] != 0). Seed the accumulators from the current
// C tile, four 128-bit vectors per row: v16-v19 from c_ptr0, v20-v23 from
// c_ptr1, v24-v27 from c_ptr2, v28-v31 from c_ptr3.
1699 "1:\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001700 "ldr q16, [%[c_ptr0]]\n"
1701 "ldr q17, [%[c_ptr0], #0x10]\n"
1702 "ldr q18, [%[c_ptr0], #0x20]\n"
1703 "ldr q19, [%[c_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001704 "ldr q20, [c_ptr1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001705 "ldr q21, [c_ptr1, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001706 "ldr q22, [c_ptr1, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001707 "ldr q23, [c_ptr1, #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001708 "ldr q24, [c_ptr2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001709 "ldr q25, [c_ptr2, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001710 "ldr q26, [c_ptr2, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001711 "ldr q27, [c_ptr2, #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001712 "ldr q28, [c_ptr3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001713 "ldr q29, [c_ptr3, #0x10]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001714 "ldr q30, [c_ptr3, #0x20]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001715 "ldr q31, [c_ptr3, #0x30]\n"
// Same operand preload as the zero-init path: 16 bytes from each A row into
// q0-q3, advancing each A pointer by 0x10.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001716 "ldr q0, [%[a_ptr0]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001717 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001718 "ldr q1, [a_ptr1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001719 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001720 "ldr q2, [a_ptr2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001721 "add a_ptr2, a_ptr2, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001722 "ldr q3, [a_ptr3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001723 "add a_ptr3, a_ptr3, #0x10\n"
// B bytes 0x00-0x6f into v8-v14; v14 is built from a 64-bit "ldr d" plus a
// general-register load merged with "ins". b_ptr0 then moves on by 0x80.
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001724 "ldr q8, [%[b_ptr0]]\n"
1725 "ldr q9, [%[b_ptr0], #0x10]\n"
1726 "ldr q10, [%[b_ptr0], #0x20]\n"
1727 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001728 "ldr q12, [%[b_ptr0], #0x40]\n"
1729 "ldr q13, [%[b_ptr0], #0x50]\n"
1730 "ldr d14, [%[b_ptr0], #0x60]\n"
1731 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
1732 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1733 "ins v14.d[1], temploadreg2\n"
// Zero %[loops] goes to label 2 (outside this view); otherwise execution falls
// straight through into the main loop at label 3.
1734 "cbz %[loops], 2f\n"
1735 "3:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001736 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001737 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001738 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001739 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001740 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001741 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001742 ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001743 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001744 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001745 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001746 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001747 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001748 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001749 "ldr d6, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001750 ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001751 "ldr temploadreg2, [a_ptr2, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001752 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001753 "ldr d7, [a_ptr3]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001754 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001755 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001756 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001757 "ldr temploadreg3, [a_ptr3, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001758 ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001759 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001760 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001761 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001762 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001763 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001764 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001765 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001766 ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001767 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001768 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001769 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001770 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001771 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001772 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001773 "ins v6.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001774 ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001775 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001776 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001777 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001778 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001779 "ins v7.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001780 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001781 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001782 ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001783 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001784 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001785 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001786 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001787 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001788 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001789 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001790 ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001791 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001792 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001793 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001794 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001795 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001796 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001797 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001798 ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001799 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001800 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001801 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001802 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001803 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001804 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001805 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001806 ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001807 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001808 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001809 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001810 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001811 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001812 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001813 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001814 ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001815 "subs %[loops], %[loops], #0x1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001816 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001817 "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001818 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001819 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001820 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001821 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001822 ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001823 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001824 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001825 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001826 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001827 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001828 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001829 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001830 ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001831 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001832 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001833 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001834 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001835 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001836 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001837 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001838 ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001839 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001840 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001841 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001842 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001843 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001844 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001845 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001846 ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001847 "ldr d13, [%[b_ptr0], #-0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001848 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001849 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001850 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001851 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001852 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001853 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001854 ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001855 "ldr d14, [%[b_ptr0], #-0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001856 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001857 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001858 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001859 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001860 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001861 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001862 ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001863 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001864 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001865 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001866 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001867 "add %[a_ptr0], %[a_ptr0], #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001868 ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001869 "ldr d0, [%[a_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001870 ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001871 "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001872 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001873 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001874 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001875 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001876 ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001877 "ins v0.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001878 ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001879 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001880 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001881 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001882 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001883 "add a_ptr1, a_ptr1, #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001884 ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001885 "ldr d1, [a_ptr1, #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001886 ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001887 "ldr temploadreg1, [a_ptr1, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001888 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001889 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001890 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001891 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001892 ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001893 "ins v1.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001894 ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001895 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001896 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001897 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001898 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001899 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001900 ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001901 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001902 ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001903 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001904 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001905 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001906 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001907 "add a_ptr2, a_ptr2, #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001908 ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001909 "ldr d2, [a_ptr2, #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001910 ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001911 "ldr temploadreg2, [a_ptr2, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001912 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001913 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001914 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001915 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001916 ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001917 "ins v2.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001918 ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001919 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001920 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001921 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001922 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001923 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001924 ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001925 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001926 ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001927 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001928 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001929 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001930 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001931 "add a_ptr3, a_ptr3, #0x20\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001932 ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001933 "ldr d3, [a_ptr3, #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001934 ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001935 "ldr temploadreg3, [a_ptr3, #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001936 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001937 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001938 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001939 "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001940 ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001941 "ins v3.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001942 ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001943 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001944 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001945 "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001946 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001947 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001948 ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001949 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001950 ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001951 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001952 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001953 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001954 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001955 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001956 ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001957 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001958 ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001959 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001960 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001961 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001962 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001963 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001964 ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001965 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001966 ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001967 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001968 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001969 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001970 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001971 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001972 ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001973 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001974 ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001975 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001976 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001977 "ldr d13, [%[b_ptr0], #-0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001978 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001979 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001980 ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001981 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001982 ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001983 "ldr d14, [%[b_ptr0], #-0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001984 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001985 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001986 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001987 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001988 ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001989 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001990 ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001991 "ins v12.d[1], temploadreg0\n"
1992 "ins v13.d[1], temploadreg1\n"
1993 "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
1994 "ins v14.d[1], temploadreg2\n"
1995 "b.ne 3b\n"
1996 "2:\n"
1997 "ldr d15, [%[b_ptr0], #-0x10]\n"
1998 "prfm PSTL1KEEP, [%[c_ptr0]]\n"
1999 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
2000 "prfm PSTL1KEEP, [c_ptr1]\n"
2001 "prfm PSTL1KEEP, [c_ptr2]\n"
2002 "prfm PSTL1KEEP, [c_ptr3]\n"
2003 "ins v15.d[1], temploadreg3\n"
2004 "cbz %[regs], 4f\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002005 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002006 "ldr d4, [%[a_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002007 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002008 "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002009 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002010 "ldr d5, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002011 ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002012 "ldr temploadreg1, [a_ptr1, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002013 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002014 "ldr d6, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002015 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002016 "ldr temploadreg2, [a_ptr2, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002017 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002018 "ldr d7, [a_ptr3]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002019 ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002020 "ldr temploadreg3, [a_ptr3, #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002021 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002022 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002023 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002024 "ins v4.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002025 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002026 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002027 ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002028 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002029 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002030 "ins v5.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002031 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002032 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002033 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002034 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002035 ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002036 "ins v6.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002037 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002038 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002039 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002040 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002041 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002042 "ins v7.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002043 ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002044 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002045 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002046 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002047 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002048 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002049 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002050 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002051 ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002052 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002053 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002054 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002055 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002056 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002057 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002058 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002059 ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002060 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002061 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002062 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002063 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002064 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002065 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002066 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002067 ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002068 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002069 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002070 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002071 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002072 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002073 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002074 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002075 ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002076 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002077 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002078 "add %[b_ptr0], %[b_ptr0], #0x100\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002079 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002080 "ldr d8, [%[b_ptr0], #-0x80]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002081 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002082 "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002083 ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002084 "ldr d9, [%[b_ptr0], #-0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002085 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002086 "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002087 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002088 "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002089 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002090 "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002091 ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002092 "ldr d10, [%[b_ptr0], #-0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002093 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002094 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002095 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002096 "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002097 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002098 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002099 ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002100 "ldr d11, [%[b_ptr0], #-0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002101 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002102 "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002103 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002104 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002105 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002106 "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002107 ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002108 "ldr d12, [%[b_ptr0], #-0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002109 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002110 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002111 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002112 "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002113 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002114 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002115 ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002116 "ldr d13, [%[b_ptr0], #-0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002117 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002118 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002119 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002120 "add %[a_ptr0], %[a_ptr0], #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002121 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002122 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002123 ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002124 "ldr d14, [%[b_ptr0], #-0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002125 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002126 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002127 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002128 "add a_ptr1, a_ptr1, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002129 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002130 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002131 ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002132 "ldr d15, [%[b_ptr0], #-0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002133 ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002134 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002135 ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002136 "add a_ptr2, a_ptr2, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002137 ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002138 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002139 ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002140 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002141 ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002142 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002143 ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002144 "add a_ptr3, a_ptr3, #0x10\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002145 ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002146 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002147 ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002148 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002149 ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002150 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002151 ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
2152 ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002153 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002154 ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002155 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002156 ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002157 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002158 ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
2159 ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002160 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002161 ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002162 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002163 ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002164 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002165 ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
2166 ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002167 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002168 ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002169 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002170 ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002171 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002172 ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
2173 ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002174 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002175 ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002176 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002177 ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
2178 ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
2179 ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002180 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002181 ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002182 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002183 ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
2184 ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
2185 ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002186 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002187 ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002188 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002189 ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002190 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002191 ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002192 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002193 ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
2194 ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
2195 ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
2196 ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
2197 ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
2198 ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
2199 ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
2200 ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
2201 ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
2202 ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
2203 ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
2204 ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
2205 ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
2206 ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
2207 ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
2208 ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
2209 ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
2210 ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
2211 ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
2212 ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
2213 ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
2214 ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
2215 ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
2216 ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
2217 ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
2218 ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
2219 ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
2220 ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
2221 ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
2222 ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002223 "b 5f\n"
2224 "4:\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002225 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002226 "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002227 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002228 "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002229 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002230 "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002231 ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002232 "ldr d8, [%[b_ptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002233 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002234 "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002235 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
2236 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002237 "ins v8.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002238 ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002239 "ldr d9, [%[b_ptr0], #0x10]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002240 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002241 "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002242 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
2243 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002244 "ins v9.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002245 ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002246 "ldr d10, [%[b_ptr0], #0x20]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002247 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002248 "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002249 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
2250 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002251 "ins v10.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002252 ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002253 "ldr d11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002254 ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002255 "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002256 ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
2257 ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002258 "ins v11.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002259 ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002260 "ldr d12, [%[b_ptr0], #0x40]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002261 ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002262 "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002263 ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
2264 ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002265 "ins v12.d[1], temploadreg0\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002266 ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002267 "ldr d13, [%[b_ptr0], #0x50]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002268 ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
2269 ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
2270 ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002271 "ins v13.d[1], temploadreg1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002272 ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002273 "ldr d14, [%[b_ptr0], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002274 ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
2275 ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
2276 ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002277 "ins v14.d[1], temploadreg2\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002278 ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002279 "ldr d15, [%[b_ptr0], #0x70]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002280 ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002281 "add %[b_ptr0], %[b_ptr0], #0x80\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002282 ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002283 "ins v15.d[1], temploadreg3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002284 ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
2285 ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
2286 ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
2287 ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
2288 ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
2289 ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
2290 ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
2291 ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
2292 ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
2293 ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
2294 ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
2295 ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
2296 ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
2297 ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
2298 ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
2299 ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
2300 ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
2301 ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
2302 ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
2303 ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
2304 ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
2305 ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
2306 ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
2307 ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
2308 ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
2309 ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
2310 ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
2311 ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
2312 ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
2313 ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002314 "5:\n"
2315 "cbz %[blocks], 6f\n"
2316 "7:\n"
2317 "ldr q8, [%[b_ptr0]]\n"
2318 "subs %[blocks], %[blocks], #0x1\n"
2319 "ldr q9, [%[b_ptr0], #0x10]\n"
2320 "ldr s0, [%[a_ptr0]]\n"
2321 "ldr q10, [%[b_ptr0], #0x20]\n"
2322 "add %[a_ptr0], %[a_ptr0], #0x4\n"
2323 "ldr q11, [%[b_ptr0], #0x30]\n"
2324 "add %[b_ptr0], %[b_ptr0], #0x40\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002325 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002326 "ldr s1, [a_ptr1]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002327 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002328 "add a_ptr1, a_ptr1, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002329 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002330 "ldr s2, [a_ptr2]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002331 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002332 "add a_ptr2, a_ptr2, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002333 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002334 "ldr s3, [a_ptr3]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002335 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002336 "add a_ptr3, a_ptr3, #0x4\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002337 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
2338 ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
2339 ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
2340 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
2341 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
2342 ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
2343 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
2344 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
2345 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
2346 ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002347 "b.ne 7b\n"
2348 "6:\n"
2349 "cbz %[odds], 8f\n"
2350 "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
2351 "ld1 {v1.b}[0], [a_ptr1], #1\n"
2352 "ld1 {v2.b}[0], [a_ptr2], #1\n"
2353 "ld1 {v3.b}[0], [a_ptr3], #1\n"
2354 "subs %[odds], %[odds], #0x1\n"
2355 "b.eq 9f\n"
2356 "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
2357 "ld1 {v1.b}[1], [a_ptr1], #1\n"
2358 "ld1 {v2.b}[1], [a_ptr2], #1\n"
2359 "ld1 {v3.b}[1], [a_ptr3], #1\n"
2360 "subs %[odds], %[odds], #0x1\n"
2361 "b.eq 9f\n"
2362 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
2363 "ld1 {v1.b}[2], [a_ptr1]\n"
2364 "ld1 {v2.b}[2], [a_ptr2]\n"
2365 "ld1 {v3.b}[2], [a_ptr3]\n"
2366 "9:\n"
2367 "ldr q8, [%[b_ptr0]]\n"
2368 "ldr q9, [%[b_ptr0], #0x10]\n"
2369 "ldr q10, [%[b_ptr0], #0x20]\n"
2370 "ldr q11, [%[b_ptr0], #0x30]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002371 ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
2372 ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
2373 ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
2374 ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
2375 ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
2376 ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
2377 ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
2378 ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
2379 ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
2380 ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
2381 ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
2382 ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
2383 ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
2384 ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
2385 ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
2386 ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002387 "8:\n"
2388 "str q16, [%[c_ptr0]]\n"
2389 "str q17, [%[c_ptr0], #0x10]\n"
2390 "str q18, [%[c_ptr0], #0x20]\n"
2391 "str q19, [%[c_ptr0], #0x30]\n"
2392 "add %[c_ptr0], %[c_ptr0], #0x40\n"
2393 "str q20, [c_ptr1]\n"
2394 "str q21, [c_ptr1, #0x10]\n"
2395 "str q22, [c_ptr1, #0x20]\n"
2396 "str q23, [c_ptr1, #0x30]\n"
2397 "str q24, [c_ptr2]\n"
2398 "str q25, [c_ptr2, #0x10]\n"
2399 "str q26, [c_ptr2, #0x20]\n"
2400 "str q27, [c_ptr2, #0x30]\n"
2401 "str q28, [c_ptr3]\n"
2402 "str q29, [c_ptr3, #0x10]\n"
2403 "str q30, [c_ptr3, #0x20]\n"
2404 "str q31, [c_ptr3, #0x30]\n"
2405 ".unreq a_ptr1\n"
2406 ".unreq a_ptr2\n"
2407 ".unreq a_ptr3\n"
2408 ".unreq c_ptr1\n"
2409 ".unreq c_ptr2\n"
2410 ".unreq c_ptr3\n"
2411 ".unreq temploadreg0\n"
2412 ".unreq temploadreg1\n"
2413 ".unreq temploadreg2\n"
2414 ".unreq temploadreg3\n"
2415 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01002416 : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002417 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
Georgios Pinitas1d480652019-01-23 11:24:50 +00002418 );
2419 break;
2420 }
Georgios Pinitas14613832019-03-01 19:07:11 +00002421 if (use_result_buffer) {
2422 for(int cy=0; cy<std::min(M-y, 4); cy++) {
2423 for(unsigned int cx=0; cx<width; cx++) {
2424 c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
2425 }
2426 }
2427 }
Georgios Pinitas1d480652019-01-23 11:24:50 +00002428 }
2429 }
2430}
2431
2432} // namespace arm_gemm
2433
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01002434#endif // __aarch64__