Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 1 | /* |
Georgios Pinitas | 5aa1a0b | 2020-07-02 20:02:20 +0100 | [diff] [blame] | 2 | * Copyright (c) 2018-2020 Arm Limited. |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #ifdef __aarch64__ |
| 25 | |
| 26 | #include <algorithm> |
| 27 | |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 28 | #include "arm_gemm.hpp" |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 29 | #include <cstdint> |
| 30 | #include "../../asmlib.hpp" |
| 31 | #include "../../utils.hpp" |
| 32 | |
| 33 | namespace arm_gemm { |
| 34 | |
Georgios Pinitas | 5aa1a0b | 2020-07-02 20:02:20 +0100 | [diff] [blame] | 35 | void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool append) { |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 36 | const int K_stride = ((K + 3) / 4) * 4; |
| 37 | const long loops_count = ((K + 16) / 32) - 1; |
| 38 | K -= loops_count * 32; |
| 39 | const long regs_count = (K / 16) - 1; |
Georgios Pinitas | 1461383 | 2019-03-01 19:07:11 +0000 | [diff] [blame] | 40 | K -= (regs_count + 1) * 16; |
| 41 | const long blocks_count = K / 4; |
| 42 | const long odds_count = K - (blocks_count * 4); |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 43 | |
Georgios Pinitas | 5aa1a0b | 2020-07-02 20:02:20 +0100 | [diff] [blame] | 44 | int rows_to_compute; |
| 45 | |
| 46 | for (int y=0; y<M; y+=rows_to_compute) { |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 47 | const uint8_t * const a_ptr0_base = A + (y * lda); |
| 48 | const unsigned long ldab = lda * sizeof(uint8_t); |
| 49 | |
| 50 | uint32_t *c_ptr0 = C + (y * ldc); |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 51 | |
Georgios Pinitas | 5aa1a0b | 2020-07-02 20:02:20 +0100 | [diff] [blame] | 52 | rows_to_compute = M-y; |
| 53 | if (rows_to_compute > 4) { |
| 54 | if (rows_to_compute % 4) { |
| 55 | rows_to_compute = 4 - 1; |
| 56 | } else { |
| 57 | rows_to_compute = 4; |
| 58 | } |
| 59 | } |
| 60 | |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 61 | for (int x0=0; x0<N; x0+=16ul) { |
| 62 | const long width = std::min((unsigned long)N-x0, 16ul); |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 63 | long loops = loops_count; |
| 64 | long regs = regs_count; |
Georgios Pinitas | 1461383 | 2019-03-01 19:07:11 +0000 | [diff] [blame] | 65 | long blocks = blocks_count; |
| 66 | long odds = odds_count; |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 67 | const uint8_t *a_ptr0 = a_ptr0_base; |
| 68 | const uint8_t *b_ptr0 = B + (K_stride * x0); |
Georgios Pinitas | 1461383 | 2019-03-01 19:07:11 +0000 | [diff] [blame] | 69 | const bool use_result_buffer = (width < 16); |
| 70 | uint32_t result_buffer[64]; |
| 71 | const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t); |
| 72 | uint32_t *c_ptr_real = c_ptr0; |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 73 | if (use_result_buffer && append) { |
Georgios Pinitas | 1461383 | 2019-03-01 19:07:11 +0000 | [diff] [blame] | 74 | for(int cy=0; cy<std::min(M-y, 4); cy++) { |
| 75 | for(unsigned int cx=0; cx<width; cx++) { |
| 76 | result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx]; |
| 77 | } |
| 78 | } |
| 79 | } |
| 80 | if (use_result_buffer) { |
| 81 | c_ptr0 = result_buffer; |
| 82 | } |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 83 | |
Georgios Pinitas | 5aa1a0b | 2020-07-02 20:02:20 +0100 | [diff] [blame] | 84 | switch(rows_to_compute) { |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 85 | case 1: |
| 86 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 87 | "temploadreg0 .req X0\n" |
| 88 | "temploadreg1 .req X1\n" |
| 89 | "temploadreg2 .req X2\n" |
| 90 | "temploadreg3 .req X3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 91 | "cbnz %[append], 1f\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 92 | "movi v16.4s, #0\n" |
| 93 | "ldr q0, [%[a_ptr0]]\n" |
| 94 | "movi v17.4s, #0\n" |
| 95 | "ldr q8, [%[b_ptr0]]\n" |
| 96 | "movi v18.4s, #0\n" |
| 97 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 98 | "movi v19.4s, #0\n" |
| 99 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 100 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 101 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 102 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 103 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 104 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 105 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 106 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 107 | "cbz %[loops], 2f\n" |
| 108 | "b 3f\n" |
| 109 | "1:\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 110 | "ldr q16, [%[c_ptr0]]\n" |
| 111 | "ldr q17, [%[c_ptr0], #0x10]\n" |
| 112 | "ldr q18, [%[c_ptr0], #0x20]\n" |
| 113 | "ldr q19, [%[c_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 114 | "ldr q0, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 115 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 116 | "ldr q8, [%[b_ptr0]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 117 | "ldr q9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 118 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 119 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 120 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 121 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 122 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 123 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 124 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 125 | "cbz %[loops], 2f\n" |
| 126 | "3:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 127 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 128 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 129 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 130 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 131 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 132 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 133 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 134 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 135 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 136 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 137 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 138 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 139 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 140 | "ldr d9, [%[b_ptr0], #0x10]\n" |
| 141 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
| 142 | "subs %[loops], %[loops], #0x1\n" |
| 143 | "ins v4.d[1], temploadreg0\n" |
| 144 | "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" |
| 145 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
| 146 | "add %[a_ptr0], %[a_ptr0], #0x20\n" |
| 147 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 148 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 149 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 150 | "ins v15.d[1], temploadreg3\n" |
| 151 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 152 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 153 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 154 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 155 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 156 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 157 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 158 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 159 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 160 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 161 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 162 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 163 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 164 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 165 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 166 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 167 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
| 168 | "ins v12.d[1], temploadreg0\n" |
| 169 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 170 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 171 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
| 172 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 173 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 174 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
| 175 | "ins v13.d[1], temploadreg1\n" |
| 176 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
| 177 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
| 178 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 179 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 180 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 181 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 182 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 183 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 184 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
| 185 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 186 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 187 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 188 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
| 189 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 190 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 191 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 192 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
| 193 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 194 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 195 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 196 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
| 197 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 198 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 199 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 200 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 201 | "ldr d0, [%[a_ptr0], #-0x10]\n" |
| 202 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 203 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 204 | "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" |
| 205 | "ldr d8, [%[b_ptr0]]\n" |
| 206 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 207 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 208 | "ins v0.d[1], temploadreg0\n" |
| 209 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
| 210 | "ins v13.d[1], temploadreg1\n" |
| 211 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
| 212 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 213 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 214 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 215 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 216 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 217 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 218 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 219 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 220 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 221 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 222 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 223 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 224 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 225 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 226 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 227 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 228 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 229 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 230 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 231 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 232 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 233 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 234 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 235 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
| 236 | "ins v12.d[1], temploadreg0\n" |
| 237 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 238 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 239 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
| 240 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 241 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 242 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
| 243 | "ins v13.d[1], temploadreg1\n" |
| 244 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
| 245 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
| 246 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 247 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 248 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 249 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 250 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 251 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 252 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
| 253 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 254 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 255 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 256 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
| 257 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 258 | "ins v9.d[1], temploadreg1\n" |
| 259 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
| 260 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 261 | "ins v10.d[1], temploadreg2\n" |
| 262 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
| 263 | "ins v11.d[1], temploadreg3\n" |
| 264 | "ins v12.d[1], temploadreg0\n" |
| 265 | "ins v13.d[1], temploadreg1\n" |
| 266 | "b.ne 3b\n" |
| 267 | "2:\n" |
| 268 | "ins v14.d[1], temploadreg2\n" |
| 269 | "prfm PSTL1KEEP, [%[c_ptr0]]\n" |
| 270 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 271 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 272 | "ins v15.d[1], temploadreg3\n" |
| 273 | "cbz %[regs], 4f\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 274 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 275 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 276 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 277 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 278 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 279 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 280 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 281 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 282 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 283 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 284 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 285 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 286 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 287 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 288 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 289 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 290 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 291 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 292 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 293 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 294 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 295 | "ins v8.d[1], temploadreg0\n" |
| 296 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 297 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 298 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 299 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 300 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 301 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 302 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 303 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 304 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 305 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 306 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 307 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 308 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
| 309 | "ins v12.d[1], temploadreg0\n" |
| 310 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 311 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 312 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
| 313 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 314 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 315 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
| 316 | "ins v13.d[1], temploadreg1\n" |
| 317 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
| 318 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
| 319 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 320 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 321 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 322 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 323 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 324 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 325 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
| 326 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 327 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 328 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 329 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
| 330 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 331 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 332 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 333 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
| 334 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 335 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 336 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 337 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
| 338 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 339 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 340 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 341 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 342 | "ldr d8, [%[b_ptr0]]\n" |
| 343 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 344 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 345 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
| 346 | "ldr d9, [%[b_ptr0], #0x10]\n" |
| 347 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 348 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 349 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
| 350 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 351 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 352 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 353 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 354 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 355 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 356 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 357 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 358 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 359 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 360 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 361 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 362 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 363 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 364 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 365 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 366 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 367 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 368 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 369 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 370 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 371 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 372 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 373 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
| 374 | "ins v12.d[1], temploadreg0\n" |
| 375 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 376 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 377 | "ins v13.d[1], temploadreg1\n" |
| 378 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 379 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 380 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 381 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
| 382 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
| 383 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 384 | "b 5f\n" |
| 385 | "4:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 386 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 387 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 388 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 389 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 390 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 391 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 392 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 393 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 394 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 395 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 396 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 397 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 398 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 399 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 400 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 401 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 402 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 403 | "ins v8.d[1], temploadreg0\n" |
| 404 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 405 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 406 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 407 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 408 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 409 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 410 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 411 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 412 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 413 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 414 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 415 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 416 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
| 417 | "ins v12.d[1], temploadreg0\n" |
| 418 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 419 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 420 | "ins v13.d[1], temploadreg1\n" |
| 421 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 422 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 423 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 424 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
| 425 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
| 426 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 427 | "5:\n" |
| 428 | "cbz %[blocks], 6f\n" |
| 429 | "7:\n" |
| 430 | "ldr q8, [%[b_ptr0]]\n" |
| 431 | "subs %[blocks], %[blocks], #0x1\n" |
| 432 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 433 | "ldr s0, [%[a_ptr0]]\n" |
| 434 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 435 | "add %[a_ptr0], %[a_ptr0], #0x4\n" |
| 436 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 437 | "add %[b_ptr0], %[b_ptr0], #0x40\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 438 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
| 439 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
| 440 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 441 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 442 | "b.ne 7b\n" |
| 443 | "6:\n" |
| 444 | "cbz %[odds], 8f\n" |
| 445 | "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" |
| 446 | "subs %[odds], %[odds], #0x1\n" |
| 447 | "b.eq 9f\n" |
| 448 | "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" |
| 449 | "subs %[odds], %[odds], #0x1\n" |
| 450 | "b.eq 9f\n" |
| 451 | "ld1 {v0.b}[2], [%[a_ptr0]]\n" |
| 452 | "9:\n" |
| 453 | "ldr q8, [%[b_ptr0]]\n" |
| 454 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 455 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 456 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 457 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
| 458 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
| 459 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 460 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 461 | "8:\n" |
| 462 | "str q16, [%[c_ptr0]]\n" |
| 463 | "str q17, [%[c_ptr0], #0x10]\n" |
| 464 | "str q18, [%[c_ptr0], #0x20]\n" |
| 465 | "str q19, [%[c_ptr0], #0x30]\n" |
| 466 | "add %[c_ptr0], %[c_ptr0], #0x40\n" |
| 467 | ".unreq temploadreg0\n" |
| 468 | ".unreq temploadreg1\n" |
| 469 | ".unreq temploadreg2\n" |
| 470 | ".unreq temploadreg3\n" |
| 471 | : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 472 | : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 473 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 474 | ); |
| 475 | break; |
| 476 | case 2: |
| 477 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 478 | "a_ptr1 .req X0\n" |
| 479 | "c_ptr1 .req X1\n" |
| 480 | "temploadreg0 .req X2\n" |
| 481 | "temploadreg1 .req X3\n" |
| 482 | "temploadreg2 .req X4\n" |
| 483 | "temploadreg3 .req X5\n" |
| 484 | "add a_ptr1, %[a_ptr0], %[lda]\n" |
| 485 | "add c_ptr1, %[c_ptr0], %[ldc]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 486 | "cbnz %[append], 1f\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 487 | "movi v16.4s, #0\n" |
| 488 | "ldr q0, [%[a_ptr0]]\n" |
| 489 | "movi v17.4s, #0\n" |
| 490 | "ldr q1, [a_ptr1]\n" |
| 491 | "movi v18.4s, #0\n" |
| 492 | "ldr q8, [%[b_ptr0]]\n" |
| 493 | "movi v19.4s, #0\n" |
| 494 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 495 | "movi v20.4s, #0\n" |
| 496 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 497 | "movi v21.4s, #0\n" |
| 498 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 499 | "movi v22.4s, #0\n" |
| 500 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 501 | "movi v23.4s, #0\n" |
| 502 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 503 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 504 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 505 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 506 | "add a_ptr1, a_ptr1, #0x10\n" |
| 507 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 508 | "cbz %[loops], 2f\n" |
| 509 | "b 3f\n" |
| 510 | "1:\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 511 | "ldr q16, [%[c_ptr0]]\n" |
| 512 | "ldr q17, [%[c_ptr0], #0x10]\n" |
| 513 | "ldr q18, [%[c_ptr0], #0x20]\n" |
| 514 | "ldr q19, [%[c_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 515 | "ldr q20, [c_ptr1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 516 | "ldr q21, [c_ptr1, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 517 | "ldr q22, [c_ptr1, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 518 | "ldr q23, [c_ptr1, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 519 | "ldr q0, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 520 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 521 | "ldr q1, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 522 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 523 | "ldr q8, [%[b_ptr0]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 524 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 525 | "ldr q10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 526 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 527 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 528 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 529 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 530 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 531 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 532 | "cbz %[loops], 2f\n" |
| 533 | "3:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 534 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 535 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 536 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 537 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 538 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 539 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 540 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 541 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 542 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 543 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 544 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 545 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 546 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 547 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 548 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 549 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 550 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 551 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 552 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 553 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 554 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 555 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 556 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 557 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 558 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 559 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 560 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 561 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 562 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 563 | "subs %[loops], %[loops], #0x1\n" |
| 564 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 565 | "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" |
| 566 | "ins v15.d[1], temploadreg3\n" |
| 567 | "add %[a_ptr0], %[a_ptr0], #0x20\n" |
| 568 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
| 569 | "add a_ptr1, a_ptr1, #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 570 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 571 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 572 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 573 | "ins v8.d[1], temploadreg0\n" |
| 574 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 575 | "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" |
| 576 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 577 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 578 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 579 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 580 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 581 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 582 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 583 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 584 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 585 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 586 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 587 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 588 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 589 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 590 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 591 | "ins v12.d[1], temploadreg0\n" |
| 592 | "ins v13.d[1], temploadreg1\n" |
| 593 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 594 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 595 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 596 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 597 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 598 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 599 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 600 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 601 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 602 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 603 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 604 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 605 | "ins v14.d[1], temploadreg2\n" |
| 606 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 607 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 608 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 609 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 610 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 611 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 612 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 613 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 614 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 615 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 616 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 617 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 618 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 619 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 620 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 621 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 622 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 623 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 624 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 625 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 626 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 627 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 628 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 629 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 630 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 631 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 632 | "ldr d0, [%[a_ptr0], #-0x10]\n" |
| 633 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 634 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 635 | "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 636 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 637 | "ldr d1, [a_ptr1, #-0x10]\n" |
| 638 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 639 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 640 | "ldr temploadreg1, [a_ptr1, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 641 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 642 | "ldr d8, [%[b_ptr0]]\n" |
| 643 | "ins v0.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 644 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 645 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 646 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 647 | "ldr d9, [%[b_ptr0], #0x10]\n" |
| 648 | "ins v1.d[1], temploadreg1\n" |
| 649 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
| 650 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 651 | "ins v14.d[1], temploadreg2\n" |
| 652 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 653 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 654 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 655 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 656 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 657 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 658 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 659 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 660 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 661 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 662 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 663 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 664 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 665 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 666 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 667 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 668 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 669 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 670 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 671 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 672 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 673 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 674 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 675 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 676 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 677 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 678 | "ins v12.d[1], temploadreg0\n" |
| 679 | "ins v13.d[1], temploadreg1\n" |
| 680 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 681 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 682 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 683 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 684 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 685 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 686 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 687 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 688 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 689 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 690 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 691 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 692 | "ins v14.d[1], temploadreg2\n" |
| 693 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 694 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 695 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 696 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 697 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 698 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 699 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 700 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 701 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 702 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 703 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 704 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 705 | "ins v9.d[1], temploadreg1\n" |
| 706 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
| 707 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 708 | "ins v10.d[1], temploadreg2\n" |
| 709 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
| 710 | "ins v11.d[1], temploadreg3\n" |
| 711 | "ins v12.d[1], temploadreg0\n" |
| 712 | "ins v13.d[1], temploadreg1\n" |
| 713 | "b.ne 3b\n" |
| 714 | "2:\n" |
| 715 | "ins v14.d[1], temploadreg2\n" |
| 716 | "prfm PSTL1KEEP, [%[c_ptr0]]\n" |
| 717 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 718 | "prfm PSTL1KEEP, [c_ptr1]\n" |
| 719 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 720 | "ins v15.d[1], temploadreg3\n" |
| 721 | "cbz %[regs], 4f\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 722 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 723 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 724 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 725 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 726 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 727 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 728 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 729 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 730 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 731 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 732 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 733 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 734 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 735 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 736 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 737 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 738 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 739 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 740 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 741 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 742 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 743 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 744 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 745 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 746 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 747 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 748 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 749 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 750 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 751 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 752 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 753 | "ins v8.d[1], temploadreg0\n" |
| 754 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
| 755 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 756 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 757 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 758 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 759 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 760 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 761 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 762 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 763 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 764 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 765 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 766 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 767 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 768 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 769 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 770 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 771 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 772 | "ins v12.d[1], temploadreg0\n" |
| 773 | "ins v13.d[1], temploadreg1\n" |
| 774 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 775 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 776 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 777 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 778 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 779 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 780 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 781 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 782 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 783 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 784 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 785 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 786 | "ins v14.d[1], temploadreg2\n" |
| 787 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
| 788 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
| 789 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 790 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 791 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 792 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 793 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
| 794 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 795 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 796 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 797 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 798 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 799 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 800 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 801 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 802 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 803 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 804 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 805 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 806 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 807 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 808 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 809 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 810 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 811 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 812 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 813 | "ldr d8, [%[b_ptr0]]\n" |
| 814 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 815 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 816 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 817 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 818 | "ldr d9, [%[b_ptr0], #0x10]\n" |
| 819 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 820 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 821 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 822 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 823 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 824 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 825 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 826 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 827 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 828 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 829 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 830 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 831 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 832 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 833 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 834 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 835 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 836 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 837 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 838 | "ldr d13, [%[b_ptr0], #0x50]\n" |
| 839 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 840 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 841 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 842 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 843 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 844 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 845 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 846 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 847 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 848 | "ldr d15, [%[b_ptr0], #0x70]\n" |
| 849 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 850 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 851 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 852 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 853 | "ins v12.d[1], temploadreg0\n" |
| 854 | "ins v13.d[1], temploadreg1\n" |
| 855 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 856 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 857 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 858 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 859 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 860 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
| 861 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
| 862 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
| 863 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
| 864 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
| 865 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
| 866 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
| 867 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 868 | "b 5f\n" |
| 869 | "4:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 870 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 871 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 872 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 873 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 874 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 875 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 876 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 877 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 878 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 879 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 880 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 881 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 882 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 883 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 884 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 885 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 886 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 887 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 888 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 889 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 890 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 891 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 892 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 893 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 894 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 895 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 896 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 897 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 898 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 899 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 900 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 901 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 902 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 903 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 904 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 905 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 906 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 907 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 908 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 909 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 910 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 911 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 912 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 913 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 914 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 915 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 916 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 917 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 918 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 919 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 920 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
| 921 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
| 922 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
| 923 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
| 924 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
| 925 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
| 926 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 927 | "5:\n" |
| 928 | "cbz %[blocks], 6f\n" |
| 929 | "7:\n" |
| 930 | "ldr q8, [%[b_ptr0]]\n" |
| 931 | "subs %[blocks], %[blocks], #0x1\n" |
| 932 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 933 | "ldr s0, [%[a_ptr0]]\n" |
| 934 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 935 | "add %[a_ptr0], %[a_ptr0], #0x4\n" |
| 936 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 937 | "add %[b_ptr0], %[b_ptr0], #0x40\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 938 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 939 | "ldr s1, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 940 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 941 | "add a_ptr1, a_ptr1, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 942 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 943 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
| 944 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 945 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 946 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 947 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 948 | "b.ne 7b\n" |
| 949 | "6:\n" |
| 950 | "cbz %[odds], 8f\n" |
| 951 | "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" |
| 952 | "ld1 {v1.b}[0], [a_ptr1], #1\n" |
| 953 | "subs %[odds], %[odds], #0x1\n" |
| 954 | "b.eq 9f\n" |
| 955 | "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" |
| 956 | "ld1 {v1.b}[1], [a_ptr1], #1\n" |
| 957 | "subs %[odds], %[odds], #0x1\n" |
| 958 | "b.eq 9f\n" |
| 959 | "ld1 {v0.b}[2], [%[a_ptr0]]\n" |
| 960 | "ld1 {v1.b}[2], [a_ptr1]\n" |
| 961 | "9:\n" |
| 962 | "ldr q8, [%[b_ptr0]]\n" |
| 963 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 964 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 965 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 966 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
| 967 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
| 968 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
| 969 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 970 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 971 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 972 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 973 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 974 | "8:\n" |
| 975 | "str q16, [%[c_ptr0]]\n" |
| 976 | "str q17, [%[c_ptr0], #0x10]\n" |
| 977 | "str q18, [%[c_ptr0], #0x20]\n" |
| 978 | "str q19, [%[c_ptr0], #0x30]\n" |
| 979 | "add %[c_ptr0], %[c_ptr0], #0x40\n" |
| 980 | "str q20, [c_ptr1]\n" |
| 981 | "str q21, [c_ptr1, #0x10]\n" |
| 982 | "str q22, [c_ptr1, #0x20]\n" |
| 983 | "str q23, [c_ptr1, #0x30]\n" |
| 984 | ".unreq a_ptr1\n" |
| 985 | ".unreq c_ptr1\n" |
| 986 | ".unreq temploadreg0\n" |
| 987 | ".unreq temploadreg1\n" |
| 988 | ".unreq temploadreg2\n" |
| 989 | ".unreq temploadreg3\n" |
| 990 | : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 991 | : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 992 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 993 | ); |
| 994 | break; |
| 995 | case 3: |
| 996 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 997 | "a_ptr1 .req X0\n" |
| 998 | "a_ptr2 .req X1\n" |
| 999 | "c_ptr1 .req X2\n" |
| 1000 | "c_ptr2 .req X3\n" |
| 1001 | "temploadreg0 .req X4\n" |
| 1002 | "temploadreg1 .req X5\n" |
| 1003 | "temploadreg2 .req X6\n" |
| 1004 | "temploadreg3 .req X7\n" |
| 1005 | "add a_ptr1, %[a_ptr0], %[lda]\n" |
| 1006 | "add c_ptr1, %[c_ptr0], %[ldc]\n" |
| 1007 | "add a_ptr2, a_ptr1, %[lda]\n" |
| 1008 | "add c_ptr2, c_ptr1, %[ldc]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1009 | "cbnz %[append], 1f\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1010 | "movi v16.4s, #0\n" |
| 1011 | "ldr q0, [%[a_ptr0]]\n" |
| 1012 | "movi v17.4s, #0\n" |
| 1013 | "ldr q1, [a_ptr1]\n" |
| 1014 | "movi v18.4s, #0\n" |
| 1015 | "ldr q2, [a_ptr2]\n" |
| 1016 | "movi v19.4s, #0\n" |
| 1017 | "ldr q8, [%[b_ptr0]]\n" |
| 1018 | "movi v20.4s, #0\n" |
| 1019 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 1020 | "movi v21.4s, #0\n" |
| 1021 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 1022 | "movi v22.4s, #0\n" |
| 1023 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 1024 | "movi v23.4s, #0\n" |
| 1025 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 1026 | "movi v24.4s, #0\n" |
| 1027 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 1028 | "movi v25.4s, #0\n" |
| 1029 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 1030 | "movi v26.4s, #0\n" |
| 1031 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 1032 | "movi v27.4s, #0\n" |
| 1033 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 1034 | "add a_ptr1, a_ptr1, #0x10\n" |
| 1035 | "ins v14.d[1], temploadreg2\n" |
| 1036 | "add a_ptr2, a_ptr2, #0x10\n" |
| 1037 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 1038 | "cbz %[loops], 2f\n" |
| 1039 | "b 3f\n" |
| 1040 | "1:\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1041 | "ldr q16, [%[c_ptr0]]\n" |
| 1042 | "ldr q17, [%[c_ptr0], #0x10]\n" |
| 1043 | "ldr q18, [%[c_ptr0], #0x20]\n" |
| 1044 | "ldr q19, [%[c_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1045 | "ldr q20, [c_ptr1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1046 | "ldr q21, [c_ptr1, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1047 | "ldr q22, [c_ptr1, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1048 | "ldr q23, [c_ptr1, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1049 | "ldr q24, [c_ptr2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1050 | "ldr q25, [c_ptr2, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1051 | "ldr q26, [c_ptr2, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1052 | "ldr q27, [c_ptr2, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1053 | "ldr q0, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1054 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1055 | "ldr q1, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1056 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1057 | "ldr q2, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1058 | "add a_ptr2, a_ptr2, #0x10\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1059 | "ldr q8, [%[b_ptr0]]\n" |
| 1060 | "ldr q9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1061 | "ldr q10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1062 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1063 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 1064 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 1065 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 1066 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 1067 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 1068 | "ins v14.d[1], temploadreg2\n" |
| 1069 | "cbz %[loops], 2f\n" |
| 1070 | "3:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1071 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1072 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1073 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1074 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1075 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1076 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1077 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1078 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1079 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1080 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1081 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1082 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1083 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1084 | "ldr d6, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1085 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1086 | "ldr temploadreg2, [a_ptr2, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1087 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1088 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1089 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1090 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1091 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1092 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1093 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1094 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1095 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1096 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1097 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1098 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1099 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1100 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1101 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1102 | "ins v6.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1103 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1104 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1105 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1106 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1107 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1108 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1109 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1110 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1111 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1112 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 1113 | "ins v8.d[1], temploadreg0\n" |
| 1114 | "subs %[loops], %[loops], #0x1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1115 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1116 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1117 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1118 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1119 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1120 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1121 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1122 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1123 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1124 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1125 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1126 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1127 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1128 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1129 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1130 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1131 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1132 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1133 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1134 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1135 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1136 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1137 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1138 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1139 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1140 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1141 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1142 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1143 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1144 | "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1145 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1146 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1147 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1148 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1149 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1150 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1151 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1152 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1153 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1154 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1155 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1156 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1157 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1158 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1159 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1160 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1161 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1162 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1163 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1164 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1165 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1166 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1167 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1168 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
| 1169 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 1170 | "add %[a_ptr0], %[a_ptr0], #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1171 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1172 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1173 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1174 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1175 | ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1176 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 1177 | "ins v10.d[1], temploadreg2\n" |
| 1178 | "add a_ptr1, a_ptr1, #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1179 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1180 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1181 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1182 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1183 | ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1184 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1185 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1186 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1187 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1188 | "ldr d0, [%[a_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1189 | ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1190 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1191 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1192 | "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1193 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1194 | "ldr d1, [a_ptr1, #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1195 | ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1196 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1197 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1198 | "ldr temploadreg1, [a_ptr1, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1199 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1200 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1201 | ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1202 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1203 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1204 | "ins v0.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1205 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1206 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1207 | ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1208 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1209 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1210 | "ins v1.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1211 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1212 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1213 | ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1214 | "ldr d10, [%[b_ptr0], #0x20]\n" |
| 1215 | "ldr d11, [%[b_ptr0], #0x30]\n" |
| 1216 | "add a_ptr2, a_ptr2, #0x20\n" |
| 1217 | "ins v15.d[1], temploadreg3\n" |
| 1218 | "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" |
| 1219 | "ldr d2, [a_ptr2, #-0x10]\n" |
| 1220 | "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1221 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1222 | "ldr temploadreg2, [a_ptr2, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1223 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1224 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1225 | ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1226 | "ldr d12, [%[b_ptr0], #0x40]\n" |
| 1227 | "ins v8.d[1], temploadreg0\n" |
| 1228 | "ins v2.d[1], temploadreg2\n" |
| 1229 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
| 1230 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1231 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1232 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1233 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1234 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1235 | ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1236 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
| 1237 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 1238 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1239 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1240 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1241 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1242 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1243 | ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1244 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1245 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1246 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1247 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1248 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1249 | ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1250 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1251 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1252 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1253 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1254 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1255 | ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1256 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1257 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1258 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1259 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1260 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1261 | ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1262 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1263 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1264 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1265 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1266 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1267 | ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1268 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1269 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1270 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1271 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1272 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1273 | ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1274 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1275 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1276 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1277 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1278 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1279 | ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1280 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
| 1281 | "ins v9.d[1], temploadreg1\n" |
| 1282 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
| 1283 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
| 1284 | "ins v10.d[1], temploadreg2\n" |
| 1285 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
| 1286 | "ins v11.d[1], temploadreg3\n" |
| 1287 | "ins v12.d[1], temploadreg0\n" |
| 1288 | "ins v13.d[1], temploadreg1\n" |
| 1289 | "ins v14.d[1], temploadreg2\n" |
| 1290 | "b.ne 3b\n" |
| 1291 | "2:\n" |
| 1292 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 1293 | "prfm PSTL1KEEP, [%[c_ptr0]]\n" |
| 1294 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 1295 | "prfm PSTL1KEEP, [c_ptr1]\n" |
| 1296 | "prfm PSTL1KEEP, [c_ptr2]\n" |
| 1297 | "ins v15.d[1], temploadreg3\n" |
| 1298 | "cbz %[regs], 4f\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1299 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1300 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1301 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1302 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1303 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1304 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1305 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1306 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1307 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1308 | "ldr d6, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1309 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1310 | "ldr temploadreg2, [a_ptr2, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1311 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1312 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1313 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1314 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1315 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1316 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1317 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1318 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1319 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1320 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1321 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1322 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1323 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1324 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1325 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1326 | "ins v6.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1327 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1328 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1329 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1330 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1331 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1332 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1333 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1334 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1335 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1336 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1337 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1338 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1339 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1340 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1341 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1342 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1343 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1344 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1345 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1346 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1347 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1348 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1349 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1350 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1351 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1352 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1353 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1354 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1355 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1356 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1357 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1358 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1359 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1360 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1361 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1362 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1363 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1364 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1365 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1366 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1367 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1368 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1369 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1370 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1371 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1372 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1373 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1374 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1375 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1376 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1377 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1378 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1379 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1380 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1381 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1382 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1383 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1384 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1385 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1386 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1387 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1388 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1389 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1390 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1391 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1392 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1393 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1394 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1395 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1396 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1397 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1398 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1399 | ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1400 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1401 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1402 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1403 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1404 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1405 | ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1406 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1407 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1408 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1409 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1410 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1411 | ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1412 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1413 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1414 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1415 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1416 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1417 | ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1418 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1419 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1420 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1421 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1422 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1423 | ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1424 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1425 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1426 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1427 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1428 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1429 | ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1430 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1431 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1432 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1433 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1434 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1435 | ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1436 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1437 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1438 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1439 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1440 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1441 | ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1442 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1443 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1444 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1445 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1446 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1447 | ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1448 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1449 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1450 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1451 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1452 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1453 | ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1454 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1455 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1456 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1457 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1458 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1459 | ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1460 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1461 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1462 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1463 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1464 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1465 | ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1466 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1467 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1468 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1469 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1470 | "add a_ptr2, a_ptr2, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1471 | ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" |
| 1472 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
| 1473 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
| 1474 | ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" |
| 1475 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
| 1476 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
| 1477 | ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" |
| 1478 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
| 1479 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
| 1480 | ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1481 | "b 5f\n" |
| 1482 | "4:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1483 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1484 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1485 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1486 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1487 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1488 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1489 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1490 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1491 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1492 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1493 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1494 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1495 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1496 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1497 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1498 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1499 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1500 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1501 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1502 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1503 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1504 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1505 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1506 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1507 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1508 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1509 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1510 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1511 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1512 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1513 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1514 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1515 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1516 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1517 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1518 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1519 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1520 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1521 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
| 1522 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1523 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1524 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1525 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1526 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
| 1527 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1528 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1529 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1530 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1531 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1532 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1533 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1534 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1535 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
| 1536 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
| 1537 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
| 1538 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
| 1539 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
| 1540 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
| 1541 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
| 1542 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
| 1543 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
| 1544 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
| 1545 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
| 1546 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
| 1547 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
| 1548 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
| 1549 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
| 1550 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
| 1551 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
| 1552 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
| 1553 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
| 1554 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
| 1555 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1556 | "5:\n" |
| 1557 | "cbz %[blocks], 6f\n" |
| 1558 | "7:\n" |
| 1559 | "ldr q8, [%[b_ptr0]]\n" |
| 1560 | "subs %[blocks], %[blocks], #0x1\n" |
| 1561 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 1562 | "ldr s0, [%[a_ptr0]]\n" |
| 1563 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 1564 | "add %[a_ptr0], %[a_ptr0], #0x4\n" |
| 1565 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 1566 | "add %[b_ptr0], %[b_ptr0], #0x40\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1567 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1568 | "ldr s1, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1569 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1570 | "add a_ptr1, a_ptr1, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1571 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1572 | "ldr s2, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1573 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1574 | "add a_ptr2, a_ptr2, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1575 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 1576 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
| 1577 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
| 1578 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 1579 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
| 1580 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 1581 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
| 1582 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1583 | "b.ne 7b\n" |
| 1584 | "6:\n" |
| 1585 | "cbz %[odds], 8f\n" |
| 1586 | "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" |
| 1587 | "ld1 {v1.b}[0], [a_ptr1], #1\n" |
| 1588 | "ld1 {v2.b}[0], [a_ptr2], #1\n" |
| 1589 | "subs %[odds], %[odds], #0x1\n" |
| 1590 | "b.eq 9f\n" |
| 1591 | "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" |
| 1592 | "ld1 {v1.b}[1], [a_ptr1], #1\n" |
| 1593 | "ld1 {v2.b}[1], [a_ptr2], #1\n" |
| 1594 | "subs %[odds], %[odds], #0x1\n" |
| 1595 | "b.eq 9f\n" |
| 1596 | "ld1 {v0.b}[2], [%[a_ptr0]]\n" |
| 1597 | "ld1 {v1.b}[2], [a_ptr1]\n" |
| 1598 | "ld1 {v2.b}[2], [a_ptr2]\n" |
| 1599 | "9:\n" |
| 1600 | "ldr q8, [%[b_ptr0]]\n" |
| 1601 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 1602 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 1603 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1604 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
| 1605 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
| 1606 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
| 1607 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
| 1608 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 1609 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
| 1610 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 1611 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 1612 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
| 1613 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 1614 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
| 1615 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1616 | "8:\n" |
| 1617 | "str q16, [%[c_ptr0]]\n" |
| 1618 | "str q17, [%[c_ptr0], #0x10]\n" |
| 1619 | "str q18, [%[c_ptr0], #0x20]\n" |
| 1620 | "str q19, [%[c_ptr0], #0x30]\n" |
| 1621 | "add %[c_ptr0], %[c_ptr0], #0x40\n" |
| 1622 | "str q20, [c_ptr1]\n" |
| 1623 | "str q21, [c_ptr1, #0x10]\n" |
| 1624 | "str q22, [c_ptr1, #0x20]\n" |
| 1625 | "str q23, [c_ptr1, #0x30]\n" |
| 1626 | "str q24, [c_ptr2]\n" |
| 1627 | "str q25, [c_ptr2, #0x10]\n" |
| 1628 | "str q26, [c_ptr2, #0x20]\n" |
| 1629 | "str q27, [c_ptr2, #0x30]\n" |
| 1630 | ".unreq a_ptr1\n" |
| 1631 | ".unreq a_ptr2\n" |
| 1632 | ".unreq c_ptr1\n" |
| 1633 | ".unreq c_ptr2\n" |
| 1634 | ".unreq temploadreg0\n" |
| 1635 | ".unreq temploadreg1\n" |
| 1636 | ".unreq temploadreg2\n" |
| 1637 | ".unreq temploadreg3\n" |
| 1638 | : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1639 | : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1640 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory" |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 1641 | ); |
| 1642 | break; |
| 1643 | default: |
| 1644 | case 4: |
| 1645 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1646 | "a_ptr1 .req X0\n" |
| 1647 | "a_ptr2 .req X1\n" |
| 1648 | "a_ptr3 .req X2\n" |
| 1649 | "c_ptr1 .req X3\n" |
| 1650 | "c_ptr2 .req X4\n" |
| 1651 | "c_ptr3 .req X5\n" |
| 1652 | "temploadreg0 .req X6\n" |
| 1653 | "temploadreg1 .req X7\n" |
| 1654 | "temploadreg2 .req X8\n" |
| 1655 | "temploadreg3 .req X9\n" |
| 1656 | "add a_ptr1, %[a_ptr0], %[lda]\n" |
| 1657 | "add c_ptr1, %[c_ptr0], %[ldc]\n" |
| 1658 | "add a_ptr2, a_ptr1, %[lda]\n" |
| 1659 | "add c_ptr2, c_ptr1, %[ldc]\n" |
| 1660 | "add a_ptr3, a_ptr2, %[lda]\n" |
| 1661 | "add c_ptr3, c_ptr2, %[ldc]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1662 | "cbnz %[append], 1f\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1663 | "movi v16.4s, #0\n" |
| 1664 | "ldr q0, [%[a_ptr0]]\n" |
| 1665 | "movi v17.4s, #0\n" |
| 1666 | "ldr q1, [a_ptr1]\n" |
| 1667 | "movi v18.4s, #0\n" |
| 1668 | "ldr q2, [a_ptr2]\n" |
| 1669 | "movi v19.4s, #0\n" |
| 1670 | "ldr q3, [a_ptr3]\n" |
| 1671 | "movi v20.4s, #0\n" |
| 1672 | "ldr q8, [%[b_ptr0]]\n" |
| 1673 | "movi v21.4s, #0\n" |
| 1674 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 1675 | "movi v22.4s, #0\n" |
| 1676 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 1677 | "movi v23.4s, #0\n" |
| 1678 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 1679 | "movi v24.4s, #0\n" |
| 1680 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 1681 | "movi v25.4s, #0\n" |
| 1682 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 1683 | "movi v26.4s, #0\n" |
| 1684 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 1685 | "movi v27.4s, #0\n" |
| 1686 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 1687 | "movi v28.4s, #0\n" |
| 1688 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
| 1689 | "movi v29.4s, #0\n" |
| 1690 | "ins v14.d[1], temploadreg2\n" |
| 1691 | "movi v30.4s, #0\n" |
| 1692 | "add a_ptr1, a_ptr1, #0x10\n" |
| 1693 | "movi v31.4s, #0\n" |
| 1694 | "add a_ptr2, a_ptr2, #0x10\n" |
| 1695 | "add a_ptr3, a_ptr3, #0x10\n" |
| 1696 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 1697 | "cbz %[loops], 2f\n" |
| 1698 | "b 3f\n" |
| 1699 | "1:\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1700 | "ldr q16, [%[c_ptr0]]\n" |
| 1701 | "ldr q17, [%[c_ptr0], #0x10]\n" |
| 1702 | "ldr q18, [%[c_ptr0], #0x20]\n" |
| 1703 | "ldr q19, [%[c_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1704 | "ldr q20, [c_ptr1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1705 | "ldr q21, [c_ptr1, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1706 | "ldr q22, [c_ptr1, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1707 | "ldr q23, [c_ptr1, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1708 | "ldr q24, [c_ptr2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1709 | "ldr q25, [c_ptr2, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1710 | "ldr q26, [c_ptr2, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1711 | "ldr q27, [c_ptr2, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1712 | "ldr q28, [c_ptr3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1713 | "ldr q29, [c_ptr3, #0x10]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1714 | "ldr q30, [c_ptr3, #0x20]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1715 | "ldr q31, [c_ptr3, #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1716 | "ldr q0, [%[a_ptr0]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1717 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1718 | "ldr q1, [a_ptr1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1719 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1720 | "ldr q2, [a_ptr2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1721 | "add a_ptr2, a_ptr2, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1722 | "ldr q3, [a_ptr3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1723 | "add a_ptr3, a_ptr3, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1724 | "ldr q8, [%[b_ptr0]]\n" |
| 1725 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 1726 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 1727 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1728 | "ldr q12, [%[b_ptr0], #0x40]\n" |
| 1729 | "ldr q13, [%[b_ptr0], #0x50]\n" |
| 1730 | "ldr d14, [%[b_ptr0], #0x60]\n" |
| 1731 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
| 1732 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
| 1733 | "ins v14.d[1], temploadreg2\n" |
| 1734 | "cbz %[loops], 2f\n" |
| 1735 | "3:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1736 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1737 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1738 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1739 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1740 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1741 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1742 | ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1743 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1744 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1745 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1746 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1747 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1748 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1749 | "ldr d6, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1750 | ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1751 | "ldr temploadreg2, [a_ptr2, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1752 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1753 | "ldr d7, [a_ptr3]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1754 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1755 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1756 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1757 | "ldr temploadreg3, [a_ptr3, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1758 | ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1759 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1760 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1761 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1762 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1763 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1764 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1765 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1766 | ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1767 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1768 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1769 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1770 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1771 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1772 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1773 | "ins v6.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1774 | ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1775 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1776 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1777 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1778 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1779 | "ins v7.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1780 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1781 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1782 | ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1783 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1784 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1785 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1786 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1787 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1788 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1789 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1790 | ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1791 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1792 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1793 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1794 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1795 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1796 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1797 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1798 | ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1799 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1800 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1801 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1802 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1803 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1804 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1805 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1806 | ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1807 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1808 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1809 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1810 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1811 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1812 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1813 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1814 | ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1815 | "subs %[loops], %[loops], #0x1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1816 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1817 | "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1818 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1819 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1820 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1821 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1822 | ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1823 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1824 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1825 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1826 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1827 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1828 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1829 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1830 | ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1831 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1832 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1833 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1834 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1835 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1836 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1837 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1838 | ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1839 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1840 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1841 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1842 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1843 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1844 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1845 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1846 | ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1847 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1848 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1849 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1850 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1851 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1852 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1853 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1854 | ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1855 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1856 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1857 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1858 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1859 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1860 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1861 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1862 | ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1863 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1864 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1865 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1866 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1867 | "add %[a_ptr0], %[a_ptr0], #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1868 | ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1869 | "ldr d0, [%[a_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1870 | ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1871 | "ldr temploadreg0, [%[a_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1872 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1873 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1874 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1875 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1876 | ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1877 | "ins v0.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1878 | ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1879 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1880 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1881 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1882 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1883 | "add a_ptr1, a_ptr1, #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1884 | ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1885 | "ldr d1, [a_ptr1, #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1886 | ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1887 | "ldr temploadreg1, [a_ptr1, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1888 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1889 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1890 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1891 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1892 | ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1893 | "ins v1.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1894 | ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1895 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1896 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1897 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1898 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1899 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1900 | ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1901 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1902 | ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1903 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1904 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1905 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1906 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1907 | "add a_ptr2, a_ptr2, #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1908 | ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1909 | "ldr d2, [a_ptr2, #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1910 | ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1911 | "ldr temploadreg2, [a_ptr2, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1912 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1913 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1914 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1915 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1916 | ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1917 | "ins v2.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1918 | ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1919 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1920 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1921 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1922 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1923 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1924 | ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1925 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1926 | ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1927 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1928 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1929 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1930 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1931 | "add a_ptr3, a_ptr3, #0x20\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1932 | ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1933 | "ldr d3, [a_ptr3, #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1934 | ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1935 | "ldr temploadreg3, [a_ptr3, #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1936 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1937 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1938 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1939 | "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1940 | ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1941 | "ins v3.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1942 | ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1943 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1944 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1945 | "prfm PLDL1KEEP, [a_ptr2, #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1946 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1947 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1948 | ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1949 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1950 | ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1951 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1952 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1953 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1954 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1955 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1956 | ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1957 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1958 | ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1959 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1960 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1961 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1962 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1963 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1964 | ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1965 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1966 | ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1967 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1968 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1969 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1970 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1971 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1972 | ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1973 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1974 | ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1975 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1976 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1977 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1978 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1979 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1980 | ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1981 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1982 | ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1983 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1984 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1985 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1986 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1987 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1988 | ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1989 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1990 | ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1991 | "ins v12.d[1], temploadreg0\n" |
| 1992 | "ins v13.d[1], temploadreg1\n" |
| 1993 | "prfm PLDL1KEEP, [a_ptr3, #0x40]\n" |
| 1994 | "ins v14.d[1], temploadreg2\n" |
| 1995 | "b.ne 3b\n" |
| 1996 | "2:\n" |
| 1997 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
| 1998 | "prfm PSTL1KEEP, [%[c_ptr0]]\n" |
| 1999 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
| 2000 | "prfm PSTL1KEEP, [c_ptr1]\n" |
| 2001 | "prfm PSTL1KEEP, [c_ptr2]\n" |
| 2002 | "prfm PSTL1KEEP, [c_ptr3]\n" |
| 2003 | "ins v15.d[1], temploadreg3\n" |
| 2004 | "cbz %[regs], 4f\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2005 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2006 | "ldr d4, [%[a_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2007 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2008 | "ldr temploadreg0, [%[a_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2009 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2010 | "ldr d5, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2011 | ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2012 | "ldr temploadreg1, [a_ptr1, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2013 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2014 | "ldr d6, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2015 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2016 | "ldr temploadreg2, [a_ptr2, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2017 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2018 | "ldr d7, [a_ptr3]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2019 | ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2020 | "ldr temploadreg3, [a_ptr3, #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2021 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2022 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2023 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2024 | "ins v4.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2025 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2026 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2027 | ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2028 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2029 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2030 | "ins v5.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2031 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2032 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2033 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2034 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2035 | ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2036 | "ins v6.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2037 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2038 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2039 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2040 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2041 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2042 | "ins v7.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2043 | ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2044 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2045 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2046 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2047 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2048 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2049 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2050 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2051 | ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2052 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2053 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2054 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2055 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2056 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2057 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2058 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2059 | ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2060 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2061 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2062 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2063 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2064 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2065 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2066 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2067 | ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2068 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2069 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2070 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2071 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2072 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2073 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2074 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2075 | ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2076 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2077 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2078 | "add %[b_ptr0], %[b_ptr0], #0x100\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2079 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2080 | "ldr d8, [%[b_ptr0], #-0x80]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2081 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2082 | "ldr temploadreg0, [%[b_ptr0], #-0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2083 | ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2084 | "ldr d9, [%[b_ptr0], #-0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2085 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2086 | "ldr temploadreg1, [%[b_ptr0], #-0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2087 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2088 | "ldr temploadreg2, [%[b_ptr0], #-0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2089 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2090 | "ldr temploadreg3, [%[b_ptr0], #-0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2091 | ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2092 | "ldr d10, [%[b_ptr0], #-0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2093 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2094 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2095 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2096 | "ldr temploadreg0, [%[b_ptr0], #-0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2097 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2098 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2099 | ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2100 | "ldr d11, [%[b_ptr0], #-0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2101 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2102 | "ldr temploadreg1, [%[b_ptr0], #-0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2103 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2104 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2105 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2106 | "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2107 | ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2108 | "ldr d12, [%[b_ptr0], #-0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2109 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2110 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2111 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2112 | "ldr temploadreg3, [%[b_ptr0], #-0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2113 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2114 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2115 | ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2116 | "ldr d13, [%[b_ptr0], #-0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2117 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2118 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2119 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2120 | "add %[a_ptr0], %[a_ptr0], #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2121 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2122 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2123 | ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2124 | "ldr d14, [%[b_ptr0], #-0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2125 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2126 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2127 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2128 | "add a_ptr1, a_ptr1, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2129 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2130 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2131 | ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2132 | "ldr d15, [%[b_ptr0], #-0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2133 | ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2134 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2135 | ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2136 | "add a_ptr2, a_ptr2, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2137 | ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2138 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2139 | ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2140 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2141 | ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2142 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2143 | ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2144 | "add a_ptr3, a_ptr3, #0x10\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2145 | ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2146 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2147 | ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2148 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2149 | ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2150 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2151 | ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n" |
| 2152 | ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2153 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2154 | ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2155 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2156 | ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2157 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2158 | ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n" |
| 2159 | ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2160 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2161 | ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2162 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2163 | ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2164 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2165 | ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n" |
| 2166 | ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2167 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2168 | ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2169 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2170 | ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2171 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2172 | ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n" |
| 2173 | ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2174 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2175 | ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2176 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2177 | ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n" |
| 2178 | ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n" |
| 2179 | ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2180 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2181 | ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2182 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2183 | ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n" |
| 2184 | ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n" |
| 2185 | ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2186 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2187 | ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2188 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2189 | ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2190 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2191 | ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2192 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2193 | ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n" |
| 2194 | ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n" |
| 2195 | ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n" |
| 2196 | ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n" |
| 2197 | ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n" |
| 2198 | ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n" |
| 2199 | ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n" |
| 2200 | ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n" |
| 2201 | ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n" |
| 2202 | ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n" |
| 2203 | ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n" |
| 2204 | ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n" |
| 2205 | ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n" |
| 2206 | ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n" |
| 2207 | ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n" |
| 2208 | ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n" |
| 2209 | ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n" |
| 2210 | ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n" |
| 2211 | ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n" |
| 2212 | ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n" |
| 2213 | ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n" |
| 2214 | ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n" |
| 2215 | ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n" |
| 2216 | ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n" |
| 2217 | ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n" |
| 2218 | ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n" |
| 2219 | ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n" |
| 2220 | ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n" |
| 2221 | ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n" |
| 2222 | ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2223 | "b 5f\n" |
| 2224 | "4:\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2225 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2226 | "ldr temploadreg0, [%[b_ptr0], #0x8]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2227 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2228 | "ldr temploadreg1, [%[b_ptr0], #0x18]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2229 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2230 | "ldr temploadreg2, [%[b_ptr0], #0x28]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2231 | ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2232 | "ldr d8, [%[b_ptr0]]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2233 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2234 | "ldr temploadreg3, [%[b_ptr0], #0x38]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2235 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 2236 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2237 | "ins v8.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2238 | ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2239 | "ldr d9, [%[b_ptr0], #0x10]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2240 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2241 | "ldr temploadreg0, [%[b_ptr0], #0x48]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2242 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 2243 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2244 | "ins v9.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2245 | ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2246 | "ldr d10, [%[b_ptr0], #0x20]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2247 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2248 | "ldr temploadreg1, [%[b_ptr0], #0x58]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2249 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
| 2250 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2251 | "ins v10.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2252 | ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2253 | "ldr d11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2254 | ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2255 | "ldr temploadreg2, [%[b_ptr0], #0x68]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2256 | ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n" |
| 2257 | ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2258 | "ins v11.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2259 | ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2260 | "ldr d12, [%[b_ptr0], #0x40]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2261 | ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2262 | "ldr temploadreg3, [%[b_ptr0], #0x78]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2263 | ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n" |
| 2264 | ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2265 | "ins v12.d[1], temploadreg0\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2266 | ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2267 | "ldr d13, [%[b_ptr0], #0x50]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2268 | ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n" |
| 2269 | ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n" |
| 2270 | ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2271 | "ins v13.d[1], temploadreg1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2272 | ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2273 | "ldr d14, [%[b_ptr0], #0x60]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2274 | ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n" |
| 2275 | ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n" |
| 2276 | ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2277 | "ins v14.d[1], temploadreg2\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2278 | ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2279 | "ldr d15, [%[b_ptr0], #0x70]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2280 | ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2281 | "add %[b_ptr0], %[b_ptr0], #0x80\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2282 | ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2283 | "ins v15.d[1], temploadreg3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2284 | ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n" |
| 2285 | ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n" |
| 2286 | ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n" |
| 2287 | ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n" |
| 2288 | ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n" |
| 2289 | ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n" |
| 2290 | ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n" |
| 2291 | ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n" |
| 2292 | ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n" |
| 2293 | ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n" |
| 2294 | ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n" |
| 2295 | ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n" |
| 2296 | ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n" |
| 2297 | ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n" |
| 2298 | ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n" |
| 2299 | ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n" |
| 2300 | ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n" |
| 2301 | ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n" |
| 2302 | ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n" |
| 2303 | ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n" |
| 2304 | ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n" |
| 2305 | ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n" |
| 2306 | ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n" |
| 2307 | ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n" |
| 2308 | ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n" |
| 2309 | ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n" |
| 2310 | ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n" |
| 2311 | ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n" |
| 2312 | ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n" |
| 2313 | ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2314 | "5:\n" |
| 2315 | "cbz %[blocks], 6f\n" |
| 2316 | "7:\n" |
| 2317 | "ldr q8, [%[b_ptr0]]\n" |
| 2318 | "subs %[blocks], %[blocks], #0x1\n" |
| 2319 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 2320 | "ldr s0, [%[a_ptr0]]\n" |
| 2321 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 2322 | "add %[a_ptr0], %[a_ptr0], #0x4\n" |
| 2323 | "ldr q11, [%[b_ptr0], #0x30]\n" |
| 2324 | "add %[b_ptr0], %[b_ptr0], #0x40\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2325 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2326 | "ldr s1, [a_ptr1]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2327 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2328 | "add a_ptr1, a_ptr1, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2329 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2330 | "ldr s2, [a_ptr2]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2331 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2332 | "add a_ptr2, a_ptr2, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2333 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2334 | "ldr s3, [a_ptr3]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2335 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2336 | "add a_ptr3, a_ptr3, #0x4\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2337 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
| 2338 | ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" |
| 2339 | ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" |
| 2340 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 2341 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
| 2342 | ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" |
| 2343 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 2344 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
| 2345 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
| 2346 | ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2347 | "b.ne 7b\n" |
| 2348 | "6:\n" |
| 2349 | "cbz %[odds], 8f\n" |
| 2350 | "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" |
| 2351 | "ld1 {v1.b}[0], [a_ptr1], #1\n" |
| 2352 | "ld1 {v2.b}[0], [a_ptr2], #1\n" |
| 2353 | "ld1 {v3.b}[0], [a_ptr3], #1\n" |
| 2354 | "subs %[odds], %[odds], #0x1\n" |
| 2355 | "b.eq 9f\n" |
| 2356 | "ld1 {v0.b}[1], [%[a_ptr0]], #1\n" |
| 2357 | "ld1 {v1.b}[1], [a_ptr1], #1\n" |
| 2358 | "ld1 {v2.b}[1], [a_ptr2], #1\n" |
| 2359 | "ld1 {v3.b}[1], [a_ptr3], #1\n" |
| 2360 | "subs %[odds], %[odds], #0x1\n" |
| 2361 | "b.eq 9f\n" |
| 2362 | "ld1 {v0.b}[2], [%[a_ptr0]]\n" |
| 2363 | "ld1 {v1.b}[2], [a_ptr1]\n" |
| 2364 | "ld1 {v2.b}[2], [a_ptr2]\n" |
| 2365 | "ld1 {v3.b}[2], [a_ptr3]\n" |
| 2366 | "9:\n" |
| 2367 | "ldr q8, [%[b_ptr0]]\n" |
| 2368 | "ldr q9, [%[b_ptr0], #0x10]\n" |
| 2369 | "ldr q10, [%[b_ptr0], #0x20]\n" |
| 2370 | "ldr q11, [%[b_ptr0], #0x30]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2371 | ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n" |
| 2372 | ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n" |
| 2373 | ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n" |
| 2374 | ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n" |
| 2375 | ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n" |
| 2376 | ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n" |
| 2377 | ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n" |
| 2378 | ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n" |
| 2379 | ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n" |
| 2380 | ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n" |
| 2381 | ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n" |
| 2382 | ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n" |
| 2383 | ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n" |
| 2384 | ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n" |
| 2385 | ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n" |
| 2386 | ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2387 | "8:\n" |
| 2388 | "str q16, [%[c_ptr0]]\n" |
| 2389 | "str q17, [%[c_ptr0], #0x10]\n" |
| 2390 | "str q18, [%[c_ptr0], #0x20]\n" |
| 2391 | "str q19, [%[c_ptr0], #0x30]\n" |
| 2392 | "add %[c_ptr0], %[c_ptr0], #0x40\n" |
| 2393 | "str q20, [c_ptr1]\n" |
| 2394 | "str q21, [c_ptr1, #0x10]\n" |
| 2395 | "str q22, [c_ptr1, #0x20]\n" |
| 2396 | "str q23, [c_ptr1, #0x30]\n" |
| 2397 | "str q24, [c_ptr2]\n" |
| 2398 | "str q25, [c_ptr2, #0x10]\n" |
| 2399 | "str q26, [c_ptr2, #0x20]\n" |
| 2400 | "str q27, [c_ptr2, #0x30]\n" |
| 2401 | "str q28, [c_ptr3]\n" |
| 2402 | "str q29, [c_ptr3, #0x10]\n" |
| 2403 | "str q30, [c_ptr3, #0x20]\n" |
| 2404 | "str q31, [c_ptr3, #0x30]\n" |
| 2405 | ".unreq a_ptr1\n" |
| 2406 | ".unreq a_ptr2\n" |
| 2407 | ".unreq a_ptr3\n" |
| 2408 | ".unreq c_ptr1\n" |
| 2409 | ".unreq c_ptr2\n" |
| 2410 | ".unreq c_ptr3\n" |
| 2411 | ".unreq temploadreg0\n" |
| 2412 | ".unreq temploadreg1\n" |
| 2413 | ".unreq temploadreg2\n" |
| 2414 | ".unreq temploadreg3\n" |
| 2415 | : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 2416 | : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2417 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory" |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 2418 | ); |
| 2419 | break; |
| 2420 | } |
Georgios Pinitas | 1461383 | 2019-03-01 19:07:11 +0000 | [diff] [blame] | 2421 | if (use_result_buffer) { |
| 2422 | for(int cy=0; cy<std::min(M-y, 4); cy++) { |
| 2423 | for(unsigned int cx=0; cx<width; cx++) { |
| 2424 | c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx]; |
| 2425 | } |
| 2426 | } |
| 2427 | } |
Georgios Pinitas | 1d48065 | 2019-01-23 11:24:50 +0000 | [diff] [blame] | 2428 | } |
| 2429 | } |
| 2430 | } |
| 2431 | |
| 2432 | } // namespace arm_gemm |
| 2433 | |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 2434 | #endif // __aarch64__ |