Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2019 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #pragma once |
| 25 | |
| 26 | #ifdef __ARM_FEATURE_SVE |
| 27 | |
| 28 | template<> |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 29 | void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 30 | { |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 31 | UNUSED(act); |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 32 | |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 33 | const int32_t *inptr = in; |
Georgios Pinitas | c7b183a | 2020-03-06 18:12:09 +0000 | [diff] [blame^] | 34 | int32_t nullbias[192]; |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 35 | |
| 36 | if (!append && !bias) |
| 37 | { |
| 38 | memset(nullbias, 0, (3 * get_vector_length<int32_t>() * sizeof(int32_t))); |
| 39 | } |
| 40 | |
| 41 | for (int y=y0; y<ymax; y+=8) |
| 42 | { |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 43 | int32_t *outptr0 = out + (y * ldout) + x0; |
| 44 | int32_t *outptr1 = outptr0 + ldout; |
| 45 | int32_t *outptr2 = outptr1 + ldout; |
| 46 | int32_t *outptr3 = outptr2 + ldout; |
| 47 | int32_t *outptr4 = outptr3 + ldout; |
| 48 | int32_t *outptr5 = outptr4 + ldout; |
| 49 | int32_t *outptr6 = outptr5 + ldout; |
| 50 | int32_t *outptr7 = outptr6 + ldout; |
| 51 | |
| 52 | const int height = ymax - y; |
| 53 | |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 54 | for (int i=x0; i<xmax; i+=(3 * get_vector_length<int32_t>())) |
| 55 | { |
| 56 | if (append) |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 57 | { |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 58 | switch(height) |
| 59 | { |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 60 | case 1: |
| 61 | { |
| 62 | long w = xmax - i; |
| 63 | long p = 0; |
| 64 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 65 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 66 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 67 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 68 | "incw %[p], all, mul #1\n" |
| 69 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 70 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 71 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 72 | "whilelt p1.s, %[p], %[w]\n" |
| 73 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 74 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 75 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 76 | "add z10.s, z10.s, z2.s\n" |
| 77 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 78 | "whilelt p2.s, %[p], %[w]\n" |
| 79 | "add z11.s, z11.s, z3.s\n" |
| 80 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 81 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
| 82 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 83 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 84 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 85 | "add z12.s, z12.s, z4.s\n" |
| 86 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 87 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 88 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 89 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 90 | : [w] "r" (w) |
| 91 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 92 | ); |
| 93 | } |
| 94 | break; |
| 95 | |
| 96 | case 2: |
| 97 | { |
| 98 | long w = xmax - i; |
| 99 | long p = 0; |
| 100 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 101 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 102 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 103 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 104 | "incw %[p], all, mul #1\n" |
| 105 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 106 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 107 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 108 | "whilelt p1.s, %[p], %[w]\n" |
| 109 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 110 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 111 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 112 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 113 | "add z10.s, z10.s, z2.s\n" |
| 114 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 115 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 116 | "whilelt p2.s, %[p], %[w]\n" |
| 117 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 118 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 119 | "add z11.s, z11.s, z3.s\n" |
| 120 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 121 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
| 122 | "add z13.s, z13.s, z5.s\n" |
| 123 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 124 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 125 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 126 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 127 | "add z12.s, z12.s, z4.s\n" |
| 128 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 129 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 130 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 131 | "add z14.s, z14.s, z6.s\n" |
| 132 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 133 | "addvl %[outptr0], %[outptr0], #3\n" |
| 134 | "add z15.s, z15.s, z7.s\n" |
| 135 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 136 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 137 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 138 | "addvl %[outptr1], %[outptr1], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 139 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 140 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 141 | : [w] "r" (w) |
| 142 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 143 | ); |
| 144 | } |
| 145 | break; |
| 146 | |
| 147 | case 3: |
| 148 | { |
| 149 | long w = xmax - i; |
| 150 | long p = 0; |
| 151 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 152 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 153 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 154 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 155 | "incw %[p], all, mul #1\n" |
| 156 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 157 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 158 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 159 | "whilelt p1.s, %[p], %[w]\n" |
| 160 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 161 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 162 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 163 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 164 | "add z10.s, z10.s, z2.s\n" |
| 165 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 166 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 167 | "whilelt p2.s, %[p], %[w]\n" |
| 168 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 169 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 170 | "add z11.s, z11.s, z3.s\n" |
| 171 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 172 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 173 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 174 | "add z13.s, z13.s, z5.s\n" |
| 175 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 176 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 177 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 178 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 179 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 180 | "add z12.s, z12.s, z4.s\n" |
| 181 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 182 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 183 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
| 184 | "add z14.s, z14.s, z6.s\n" |
| 185 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 186 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 187 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 188 | "add z15.s, z15.s, z7.s\n" |
| 189 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 190 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 191 | "add z16.s, z16.s, z8.s\n" |
| 192 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 193 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 194 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 195 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 196 | "add z17.s, z17.s, z9.s\n" |
| 197 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 198 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 199 | "addvl %[outptr1], %[outptr1], #3\n" |
| 200 | "add z10.s, z10.s, z2.s\n" |
| 201 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 202 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 203 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 204 | "addvl %[outptr2], %[outptr2], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 205 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 206 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 207 | : [w] "r" (w) |
| 208 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 209 | ); |
| 210 | } |
| 211 | break; |
| 212 | |
| 213 | case 4: |
| 214 | { |
| 215 | long w = xmax - i; |
| 216 | long p = 0; |
| 217 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 218 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 219 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 220 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 221 | "incw %[p], all, mul #1\n" |
| 222 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 223 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 224 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 225 | "whilelt p1.s, %[p], %[w]\n" |
| 226 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 227 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 228 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 229 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 230 | "add z10.s, z10.s, z2.s\n" |
| 231 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 232 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 233 | "whilelt p2.s, %[p], %[w]\n" |
| 234 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 235 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 236 | "add z11.s, z11.s, z3.s\n" |
| 237 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 238 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 239 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 240 | "add z13.s, z13.s, z5.s\n" |
| 241 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 242 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 243 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 244 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 245 | "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" |
| 246 | "add z12.s, z12.s, z4.s\n" |
| 247 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 248 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 249 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 250 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
| 251 | "add z14.s, z14.s, z6.s\n" |
| 252 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 253 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 254 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 255 | "add z15.s, z15.s, z7.s\n" |
| 256 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 257 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 258 | "add z16.s, z16.s, z8.s\n" |
| 259 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 260 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 261 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 262 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 263 | "add z17.s, z17.s, z9.s\n" |
| 264 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 265 | "ld1w z3.s, p0/z, [%[outptr3]]\n" |
| 266 | "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n" |
| 267 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 268 | "addvl %[outptr1], %[outptr1], #3\n" |
| 269 | "add z10.s, z10.s, z2.s\n" |
| 270 | "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n" |
| 271 | "add z11.s, z11.s, z3.s\n" |
| 272 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 273 | "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n" |
| 274 | "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n" |
| 275 | "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n" |
| 276 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 277 | "add z12.s, z12.s, z4.s\n" |
| 278 | "add z13.s, z13.s, z5.s\n" |
| 279 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 280 | "addvl %[outptr2], %[outptr2], #3\n" |
| 281 | "st1w z11.s, p0, [%[outptr3]]\n" |
| 282 | "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 283 | "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 284 | "addvl %[outptr3], %[outptr3], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 285 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 286 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 287 | : [w] "r" (w) |
| 288 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 289 | ); |
| 290 | } |
| 291 | break; |
| 292 | |
| 293 | case 5: |
| 294 | { |
| 295 | long w = xmax - i; |
| 296 | long p = 0; |
| 297 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 298 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 299 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 300 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 301 | "incw %[p], all, mul #1\n" |
| 302 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 303 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 304 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 305 | "whilelt p1.s, %[p], %[w]\n" |
| 306 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 307 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 308 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 309 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 310 | "add z10.s, z10.s, z2.s\n" |
| 311 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 312 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 313 | "whilelt p2.s, %[p], %[w]\n" |
| 314 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 315 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 316 | "add z11.s, z11.s, z3.s\n" |
| 317 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 318 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 319 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 320 | "add z13.s, z13.s, z5.s\n" |
| 321 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 322 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 323 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 324 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 325 | "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" |
| 326 | "add z12.s, z12.s, z4.s\n" |
| 327 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 328 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 329 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 330 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 331 | "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" |
| 332 | "add z14.s, z14.s, z6.s\n" |
| 333 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 334 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 335 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 336 | "add z15.s, z15.s, z7.s\n" |
| 337 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 338 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 339 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 340 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 341 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 342 | "add z16.s, z16.s, z8.s\n" |
| 343 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 344 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
| 345 | "add z17.s, z17.s, z9.s\n" |
| 346 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 347 | "ld1w z3.s, p0/z, [%[outptr3]]\n" |
| 348 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 349 | "addvl %[outptr1], %[outptr1], #3\n" |
| 350 | "add z10.s, z10.s, z2.s\n" |
| 351 | "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n" |
| 352 | "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n" |
| 353 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 354 | "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n" |
| 355 | "add z11.s, z11.s, z3.s\n" |
| 356 | "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n" |
| 357 | "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n" |
| 358 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 359 | "add z12.s, z12.s, z4.s\n" |
| 360 | "ld1w z6.s, p0/z, [%[outptr4]]\n" |
| 361 | "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n" |
| 362 | "add z13.s, z13.s, z5.s\n" |
| 363 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 364 | "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n" |
| 365 | "addvl %[outptr2], %[outptr2], #3\n" |
| 366 | "add z14.s, z14.s, z6.s\n" |
| 367 | "st1w z11.s, p0, [%[outptr3]]\n" |
| 368 | "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n" |
| 369 | "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n" |
| 370 | "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n" |
| 371 | "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 372 | "add z15.s, z15.s, z7.s\n" |
| 373 | "add z16.s, z16.s, z8.s\n" |
| 374 | "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 375 | "addvl %[outptr3], %[outptr3], #3\n" |
| 376 | "st1w z14.s, p0, [%[outptr4]]\n" |
| 377 | "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 378 | "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 379 | "addvl %[outptr4], %[outptr4], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 380 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 381 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 382 | : [w] "r" (w) |
| 383 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 384 | ); |
| 385 | } |
| 386 | break; |
| 387 | |
| 388 | case 6: |
| 389 | { |
| 390 | long w = xmax - i; |
| 391 | long p = 0; |
| 392 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 393 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 394 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 395 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 396 | "incw %[p], all, mul #1\n" |
| 397 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 398 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 399 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 400 | "whilelt p1.s, %[p], %[w]\n" |
| 401 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 402 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 403 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 404 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 405 | "add z10.s, z10.s, z2.s\n" |
| 406 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 407 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 408 | "whilelt p2.s, %[p], %[w]\n" |
| 409 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 410 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 411 | "add z11.s, z11.s, z3.s\n" |
| 412 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 413 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 414 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 415 | "add z13.s, z13.s, z5.s\n" |
| 416 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 417 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 418 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 419 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 420 | "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" |
| 421 | "add z12.s, z12.s, z4.s\n" |
| 422 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 423 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 424 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 425 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 426 | "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" |
| 427 | "add z14.s, z14.s, z6.s\n" |
| 428 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 429 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 430 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 431 | "add z15.s, z15.s, z7.s\n" |
| 432 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 433 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 434 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 435 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 436 | "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" |
| 437 | "add z16.s, z16.s, z8.s\n" |
| 438 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 439 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 440 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 441 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
| 442 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 443 | "addvl %[outptr1], %[outptr1], #3\n" |
| 444 | "add z17.s, z17.s, z9.s\n" |
| 445 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 446 | "ld1w z3.s, p0/z, [%[outptr3]]\n" |
| 447 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 448 | "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n" |
| 449 | "add z10.s, z10.s, z2.s\n" |
| 450 | "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n" |
| 451 | "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n" |
| 452 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 453 | "add z11.s, z11.s, z3.s\n" |
| 454 | "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n" |
| 455 | "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n" |
| 456 | "add z12.s, z12.s, z4.s\n" |
| 457 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 458 | "ld1w z6.s, p0/z, [%[outptr4]]\n" |
| 459 | "addvl %[outptr2], %[outptr2], #3\n" |
| 460 | "add z13.s, z13.s, z5.s\n" |
| 461 | "st1w z11.s, p0, [%[outptr3]]\n" |
| 462 | "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n" |
| 463 | "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n" |
| 464 | "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n" |
| 465 | "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 466 | "add z14.s, z14.s, z6.s\n" |
| 467 | "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n" |
| 468 | "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n" |
| 469 | "add z15.s, z15.s, z7.s\n" |
| 470 | "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 471 | "ld1w z9.s, p0/z, [%[outptr5]]\n" |
| 472 | "addvl %[outptr3], %[outptr3], #3\n" |
| 473 | "add z16.s, z16.s, z8.s\n" |
| 474 | "st1w z14.s, p0, [%[outptr4]]\n" |
| 475 | "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n" |
| 476 | "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n" |
| 477 | "ld1w z10.s, p1/z, [x8]\n" |
| 478 | "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 479 | "add z17.s, z17.s, z9.s\n" |
| 480 | "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n" |
| 481 | "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n" |
| 482 | "add z10.s, z10.s, z2.s\n" |
| 483 | "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 484 | "addvl %[outptr4], %[outptr4], #3\n" |
| 485 | "add z11.s, z11.s, z3.s\n" |
| 486 | "st1w z17.s, p0, [%[outptr5]]\n" |
| 487 | "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 488 | "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 489 | "addvl %[outptr5], %[outptr5], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 490 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 491 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 492 | : [w] "r" (w) |
| 493 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 494 | ); |
| 495 | } |
| 496 | break; |
| 497 | |
| 498 | case 7: |
| 499 | { |
| 500 | long w = xmax - i; |
| 501 | long p = 0; |
| 502 | /* Optimized routine to accumulate an entire block into the output (out += in) */ |
| 503 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 504 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 505 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 506 | "incw %[p], all, mul #1\n" |
| 507 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 508 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 509 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 510 | "whilelt p1.s, %[p], %[w]\n" |
| 511 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 512 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 513 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 514 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 515 | "add z10.s, z10.s, z2.s\n" |
| 516 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 517 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 518 | "whilelt p2.s, %[p], %[w]\n" |
| 519 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 520 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 521 | "add z11.s, z11.s, z3.s\n" |
| 522 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 523 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 524 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 525 | "add z13.s, z13.s, z5.s\n" |
| 526 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 527 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 528 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 529 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 530 | "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" |
| 531 | "add z12.s, z12.s, z4.s\n" |
| 532 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 533 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 534 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 535 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 536 | "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" |
| 537 | "add z14.s, z14.s, z6.s\n" |
| 538 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 539 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 540 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 541 | "add z15.s, z15.s, z7.s\n" |
| 542 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 543 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 544 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 545 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 546 | "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" |
| 547 | "add z16.s, z16.s, z8.s\n" |
| 548 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 549 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 550 | "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 551 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
| 552 | "prfm PLDL1KEEP, [%[outptr6], #0x60]\n" |
| 553 | "add z17.s, z17.s, z9.s\n" |
| 554 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 555 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 556 | "addvl %[outptr1], %[outptr1], #3\n" |
| 557 | "ld1w z3.s, p0/z, [%[outptr3]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 558 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 559 | "add z10.s, z10.s, z2.s\n" |
| 560 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 561 | "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n" |
| 562 | "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n" |
| 563 | "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n" |
| 564 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 565 | "add z11.s, z11.s, z3.s\n" |
| 566 | "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n" |
| 567 | "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n" |
| 568 | "add z12.s, z12.s, z4.s\n" |
| 569 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 570 | "ld1w z6.s, p0/z, [%[outptr4]]\n" |
| 571 | "addvl %[outptr2], %[outptr2], #3\n" |
| 572 | "add z13.s, z13.s, z5.s\n" |
| 573 | "st1w z11.s, p0, [%[outptr3]]\n" |
| 574 | "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n" |
| 575 | "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n" |
| 576 | "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n" |
| 577 | "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 578 | "add z14.s, z14.s, z6.s\n" |
| 579 | "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n" |
| 580 | "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n" |
| 581 | "add z15.s, z15.s, z7.s\n" |
| 582 | "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 583 | "ld1w z9.s, p0/z, [%[outptr5]]\n" |
| 584 | "addvl %[outptr3], %[outptr3], #3\n" |
| 585 | "add z16.s, z16.s, z8.s\n" |
| 586 | "st1w z14.s, p0, [%[outptr4]]\n" |
| 587 | "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n" |
| 588 | "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n" |
| 589 | "ld1w z10.s, p1/z, [x8]\n" |
| 590 | "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 591 | "add z17.s, z17.s, z9.s\n" |
| 592 | "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n" |
| 593 | "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n" |
| 594 | "add z10.s, z10.s, z2.s\n" |
| 595 | "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 596 | "ld1w z4.s, p0/z, [%[outptr6]]\n" |
| 597 | "addvl %[outptr4], %[outptr4], #3\n" |
| 598 | "add z11.s, z11.s, z3.s\n" |
| 599 | "st1w z17.s, p0, [%[outptr5]]\n" |
| 600 | "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n" |
| 601 | "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n" |
| 602 | "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n" |
| 603 | "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 604 | "add z12.s, z12.s, z4.s\n" |
| 605 | "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n" |
| 606 | "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n" |
| 607 | "add z13.s, z13.s, z5.s\n" |
| 608 | "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 609 | "addvl %[outptr5], %[outptr5], #3\n" |
| 610 | "add z14.s, z14.s, z6.s\n" |
| 611 | "st1w z12.s, p0, [%[outptr6]]\n" |
| 612 | "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n" |
| 613 | "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n" |
| 614 | "addvl %[outptr6], %[outptr6], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 615 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 616 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 617 | : [w] "r" (w) |
| 618 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 619 | ); |
| 620 | } |
| 621 | break; |
| 622 | |
| 623 | default: |
| 624 | case 8: |
| 625 | { |
| 626 | long w = xmax - i; |
| 627 | long p = 0; |
| 628 | /* Optimized routine to copy an entire block */ |
| 629 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 630 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 631 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 632 | "incw %[p], all, mul #1\n" |
| 633 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 634 | "prfm PLDL1KEEP, [%[outptr0], #0x60]\n" |
| 635 | "ld1w z2.s, p0/z, [%[outptr0]]\n" |
| 636 | "whilelt p1.s, %[p], %[w]\n" |
| 637 | "ld1w z10.s, p0/z, [%[inptr]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 638 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 639 | "ld1w z5.s, p0/z, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 640 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 641 | "add z10.s, z10.s, z2.s\n" |
| 642 | "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n" |
| 643 | "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 644 | "whilelt p2.s, %[p], %[w]\n" |
| 645 | "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 646 | "prfm PLDL1KEEP, [%[outptr1], #0x60]\n" |
| 647 | "add z11.s, z11.s, z3.s\n" |
| 648 | "st1w z10.s, p0, [%[outptr0]]\n" |
| 649 | "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 650 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 651 | "add z13.s, z13.s, z5.s\n" |
| 652 | "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 653 | "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 654 | "prfm PLDL1KEEP, [%[outptr2], #0x60]\n" |
| 655 | "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n" |
| 656 | "prfm PLDL1KEEP, [%[outptr3], #0x60]\n" |
| 657 | "add z12.s, z12.s, z4.s\n" |
| 658 | "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 659 | "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n" |
| 660 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 661 | "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 662 | "prfm PLDL1KEEP, [%[outptr4], #0x60]\n" |
| 663 | "add z14.s, z14.s, z6.s\n" |
| 664 | "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 665 | "ld1w z8.s, p0/z, [%[outptr2]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 666 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 667 | "add z15.s, z15.s, z7.s\n" |
| 668 | "st1w z13.s, p0, [%[outptr1]]\n" |
| 669 | "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 670 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 671 | "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n" |
| 672 | "prfm PLDL1KEEP, [%[outptr5], #0x60]\n" |
| 673 | "add z16.s, z16.s, z8.s\n" |
| 674 | "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 675 | "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 676 | "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 677 | "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n" |
| 678 | "prfm PLDL1KEEP, [%[outptr6], #0x60]\n" |
| 679 | "add z17.s, z17.s, z9.s\n" |
| 680 | "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 681 | "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n" |
| 682 | "addvl %[outptr1], %[outptr1], #3\n" |
| 683 | "ld1w z3.s, p0/z, [%[outptr3]]\n" |
| 684 | "prfm PLDL1KEEP, [%[outptr7], #0x60]\n" |
| 685 | "add z10.s, z10.s, z2.s\n" |
| 686 | "st1w z16.s, p0, [%[outptr2]]\n" |
| 687 | "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 688 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 689 | "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n" |
| 690 | "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 691 | "add z11.s, z11.s, z3.s\n" |
| 692 | "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n" |
| 693 | "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n" |
| 694 | "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n" |
| 695 | "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 696 | "addvl %[outptr2], %[outptr2], #3\n" |
| 697 | "add z12.s, z12.s, z4.s\n" |
| 698 | "ld1w z6.s, p0/z, [%[outptr4]]\n" |
| 699 | "add z13.s, z13.s, z5.s\n" |
| 700 | "st1w z11.s, p0, [%[outptr3]]\n" |
| 701 | "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n" |
| 702 | "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n" |
| 703 | "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n" |
| 704 | "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 705 | "add z14.s, z14.s, z6.s\n" |
| 706 | "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n" |
| 707 | "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n" |
| 708 | "add z15.s, z15.s, z7.s\n" |
| 709 | "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 710 | "ld1w z9.s, p0/z, [%[outptr5]]\n" |
| 711 | "addvl %[outptr3], %[outptr3], #3\n" |
| 712 | "add z16.s, z16.s, z8.s\n" |
| 713 | "st1w z14.s, p0, [%[outptr4]]\n" |
| 714 | "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n" |
| 715 | "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n" |
| 716 | "ld1w z10.s, p1/z, [x8]\n" |
| 717 | "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 718 | "add z17.s, z17.s, z9.s\n" |
| 719 | "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n" |
| 720 | "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n" |
| 721 | "add z10.s, z10.s, z2.s\n" |
| 722 | "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 723 | "ld1w z4.s, p0/z, [%[outptr6]]\n" |
| 724 | "addvl %[outptr4], %[outptr4], #3\n" |
| 725 | "add z11.s, z11.s, z3.s\n" |
| 726 | "st1w z17.s, p0, [%[outptr5]]\n" |
| 727 | "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n" |
| 728 | "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n" |
| 729 | "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n" |
| 730 | "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 731 | "add z12.s, z12.s, z4.s\n" |
| 732 | "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n" |
| 733 | "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n" |
| 734 | "add z13.s, z13.s, z5.s\n" |
| 735 | "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 736 | "ld1w z7.s, p0/z, [%[outptr7]]\n" |
| 737 | "addvl %[outptr5], %[outptr5], #3\n" |
| 738 | "add z14.s, z14.s, z6.s\n" |
| 739 | "st1w z12.s, p0, [%[outptr6]]\n" |
| 740 | "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n" |
| 741 | "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n" |
| 742 | "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n" |
| 743 | "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n" |
| 744 | "add z15.s, z15.s, z7.s\n" |
| 745 | "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n" |
| 746 | "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n" |
| 747 | "add z16.s, z16.s, z8.s\n" |
| 748 | "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n" |
| 749 | "addvl %[outptr6], %[outptr6], #3\n" |
| 750 | "add z17.s, z17.s, z9.s\n" |
| 751 | "st1w z15.s, p0, [%[outptr7]]\n" |
| 752 | "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n" |
| 753 | "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n" |
| 754 | "addvl %[outptr7], %[outptr7], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 755 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 756 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 757 | : [w] "r" (w) |
| 758 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 759 | ); |
| 760 | } |
| 761 | break; |
| 762 | |
| 763 | |
| 764 | } |
| 765 | } |
| 766 | else |
| 767 | { |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 768 | const int32_t *biasptr = nullbias; |
| 769 | if (bias) |
| 770 | { |
| 771 | biasptr = bias + i; |
| 772 | } |
| 773 | |
| 774 | switch(height) |
| 775 | { |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 776 | case 1: |
| 777 | { |
| 778 | long w = xmax - i; |
| 779 | long p = 0; |
| 780 | /* Optimized routine to copy an entire block */ |
| 781 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 782 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 783 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 784 | "incw %[p], all, mul #1\n" |
| 785 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 786 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 787 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 788 | "whilelt p1.s, %[p], %[w]\n" |
| 789 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 790 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 791 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
| 792 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 793 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 794 | "whilelt p2.s, %[p], %[w]\n" |
| 795 | "add z13.s, z13.s, z2.s\n" |
| 796 | "add z14.s, z14.s, z3.s\n" |
| 797 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 798 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 799 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 800 | "add z15.s, z15.s, z4.s\n" |
| 801 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 802 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 803 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 804 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 805 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 806 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 807 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 808 | ); |
| 809 | } |
| 810 | break; |
| 811 | |
| 812 | case 2: |
| 813 | { |
| 814 | long w = xmax - i; |
| 815 | long p = 0; |
| 816 | /* Optimized routine to copy an entire block */ |
| 817 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 818 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 819 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 820 | "incw %[p], all, mul #1\n" |
| 821 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 822 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 823 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 824 | "whilelt p1.s, %[p], %[w]\n" |
| 825 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 826 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 827 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 828 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 829 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 830 | "whilelt p2.s, %[p], %[w]\n" |
| 831 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 832 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 833 | "add z13.s, z13.s, z2.s\n" |
| 834 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 835 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
| 836 | "add z14.s, z14.s, z3.s\n" |
| 837 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 838 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 839 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 840 | "add z15.s, z15.s, z4.s\n" |
| 841 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 842 | "add z16.s, z16.s, z2.s\n" |
| 843 | "add z17.s, z17.s, z3.s\n" |
| 844 | "add z18.s, z18.s, z4.s\n" |
| 845 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 846 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 847 | "addvl %[outptr0], %[outptr0], #3\n" |
| 848 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 849 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 850 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 851 | "addvl %[outptr1], %[outptr1], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 852 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 853 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 854 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 855 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 856 | ); |
| 857 | } |
| 858 | break; |
| 859 | |
| 860 | case 3: |
| 861 | { |
| 862 | long w = xmax - i; |
| 863 | long p = 0; |
| 864 | /* Optimized routine to copy an entire block */ |
| 865 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 866 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 867 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 868 | "incw %[p], all, mul #1\n" |
| 869 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 870 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 871 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 872 | "whilelt p1.s, %[p], %[w]\n" |
| 873 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 874 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 875 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 876 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 877 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 878 | "whilelt p2.s, %[p], %[w]\n" |
| 879 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 880 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 881 | "add z13.s, z13.s, z2.s\n" |
| 882 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 883 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 884 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 885 | "add z14.s, z14.s, z3.s\n" |
| 886 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 887 | "add z15.s, z15.s, z4.s\n" |
| 888 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 889 | "add z16.s, z16.s, z2.s\n" |
| 890 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 891 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 892 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 893 | "add z17.s, z17.s, z3.s\n" |
| 894 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 895 | "add z18.s, z18.s, z4.s\n" |
| 896 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 897 | "add z19.s, z19.s, z2.s\n" |
| 898 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 899 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 900 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 901 | "addvl %[outptr0], %[outptr0], #3\n" |
| 902 | "add z20.s, z20.s, z3.s\n" |
| 903 | "add z13.s, z13.s, z4.s\n" |
| 904 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 905 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 906 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 907 | "addvl %[outptr1], %[outptr1], #3\n" |
| 908 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 909 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 910 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 911 | "addvl %[outptr2], %[outptr2], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 912 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 913 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 914 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 915 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 916 | ); |
| 917 | } |
| 918 | break; |
| 919 | |
| 920 | case 4: |
| 921 | { |
| 922 | long w = xmax - i; |
| 923 | long p = 0; |
| 924 | /* Optimized routine to copy an entire block */ |
| 925 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 926 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 927 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 928 | "incw %[p], all, mul #1\n" |
| 929 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 930 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 931 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 932 | "whilelt p1.s, %[p], %[w]\n" |
| 933 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 934 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 935 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 936 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 937 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 938 | "whilelt p2.s, %[p], %[w]\n" |
| 939 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 940 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 941 | "add z13.s, z13.s, z2.s\n" |
| 942 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 943 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 944 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 945 | "add z14.s, z14.s, z3.s\n" |
| 946 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 947 | "add z15.s, z15.s, z4.s\n" |
| 948 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 949 | "add z16.s, z16.s, z2.s\n" |
| 950 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 951 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 952 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 953 | "add z17.s, z17.s, z3.s\n" |
| 954 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 955 | "add z18.s, z18.s, z4.s\n" |
| 956 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 957 | "add z19.s, z19.s, z2.s\n" |
| 958 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
| 959 | "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n" |
| 960 | "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" |
| 961 | "add z20.s, z20.s, z3.s\n" |
| 962 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 963 | "add z13.s, z13.s, z4.s\n" |
| 964 | "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n" |
| 965 | "add z14.s, z14.s, z2.s\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 966 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 967 | "st1w z16.s, p0, [%[outptr1]]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 968 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 969 | "add z15.s, z15.s, z3.s\n" |
| 970 | "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n" |
| 971 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 972 | "add z16.s, z16.s, z4.s\n" |
| 973 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 974 | "addvl %[outptr1], %[outptr1], #3\n" |
| 975 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 976 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 977 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 978 | "addvl %[outptr2], %[outptr2], #3\n" |
| 979 | "st1w z14.s, p0, [%[outptr3]]\n" |
| 980 | "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 981 | "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 982 | "addvl %[outptr3], %[outptr3], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 983 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 984 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 985 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 986 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 987 | ); |
| 988 | } |
| 989 | break; |
| 990 | |
| 991 | case 5: |
| 992 | { |
| 993 | long w = xmax - i; |
| 994 | long p = 0; |
| 995 | /* Optimized routine to copy an entire block */ |
| 996 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 997 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 998 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 999 | "incw %[p], all, mul #1\n" |
| 1000 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1001 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 1002 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 1003 | "whilelt p1.s, %[p], %[w]\n" |
| 1004 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1005 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1006 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1007 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1008 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 1009 | "whilelt p2.s, %[p], %[w]\n" |
| 1010 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 1011 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 1012 | "add z13.s, z13.s, z2.s\n" |
| 1013 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 1014 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1015 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1016 | "add z14.s, z14.s, z3.s\n" |
| 1017 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 1018 | "add z15.s, z15.s, z4.s\n" |
| 1019 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 1020 | "add z16.s, z16.s, z2.s\n" |
| 1021 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 1022 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 1023 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 1024 | "add z17.s, z17.s, z3.s\n" |
| 1025 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 1026 | "add z18.s, z18.s, z4.s\n" |
| 1027 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 1028 | "add z19.s, z19.s, z2.s\n" |
| 1029 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
| 1030 | "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n" |
| 1031 | "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" |
| 1032 | "add z20.s, z20.s, z3.s\n" |
| 1033 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 1034 | "add z13.s, z13.s, z4.s\n" |
| 1035 | "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n" |
| 1036 | "add z14.s, z14.s, z2.s\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1037 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1038 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 1039 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 1040 | "add z15.s, z15.s, z3.s\n" |
| 1041 | "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n" |
| 1042 | "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" |
| 1043 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1044 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1045 | "add z16.s, z16.s, z4.s\n" |
| 1046 | "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n" |
| 1047 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
| 1048 | "addvl %[outptr1], %[outptr1], #3\n" |
| 1049 | "add z17.s, z17.s, z2.s\n" |
| 1050 | "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n" |
| 1051 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 1052 | "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n" |
| 1053 | "add z18.s, z18.s, z3.s\n" |
| 1054 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 1055 | "add z19.s, z19.s, z4.s\n" |
| 1056 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 1057 | "addvl %[outptr2], %[outptr2], #3\n" |
| 1058 | "st1w z14.s, p0, [%[outptr3]]\n" |
| 1059 | "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 1060 | "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 1061 | "addvl %[outptr3], %[outptr3], #3\n" |
| 1062 | "st1w z17.s, p0, [%[outptr4]]\n" |
| 1063 | "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 1064 | "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 1065 | "addvl %[outptr4], %[outptr4], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1066 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 1067 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1068 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 1069 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1070 | ); |
| 1071 | } |
| 1072 | break; |
| 1073 | |
| 1074 | case 6: |
| 1075 | { |
| 1076 | long w = xmax - i; |
| 1077 | long p = 0; |
| 1078 | /* Optimized routine: add the bias vectors to a 6-row x 3-vector block of input and store the sums to outptr0..outptr5 */ |
| 1079 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1080 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1081 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1082 | "incw %[p], all, mul #1\n" |
| 1083 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1084 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 1085 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 1086 | "whilelt p1.s, %[p], %[w]\n" |
| 1087 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1088 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1089 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1090 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1091 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 1092 | "whilelt p2.s, %[p], %[w]\n" |
| 1093 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 1094 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 1095 | "add z13.s, z13.s, z2.s\n" |
| 1096 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 1097 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1098 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1099 | "add z14.s, z14.s, z3.s\n" |
| 1100 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 1101 | "add z15.s, z15.s, z4.s\n" |
| 1102 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 1103 | "add z16.s, z16.s, z2.s\n" |
| 1104 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 1105 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 1106 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 1107 | "add z17.s, z17.s, z3.s\n" |
| 1108 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 1109 | "add z18.s, z18.s, z4.s\n" |
| 1110 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 1111 | "add z19.s, z19.s, z2.s\n" |
| 1112 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
| 1113 | "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n" |
| 1114 | "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" |
| 1115 | "add z20.s, z20.s, z3.s\n" |
| 1116 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 1117 | "add z13.s, z13.s, z4.s\n" |
| 1118 | "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n" |
| 1119 | "add z14.s, z14.s, z2.s\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1120 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1121 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 1122 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 1123 | "add z15.s, z15.s, z3.s\n" |
| 1124 | "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n" |
| 1125 | "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" |
| 1126 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 1127 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 1128 | "add z16.s, z16.s, z4.s\n" |
| 1129 | "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n" |
| 1130 | "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" |
| 1131 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1132 | "addvl %[outptr1], %[outptr1], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1133 | "add z17.s, z17.s, z2.s\n" |
| 1134 | "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1135 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1136 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 1137 | "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n" |
| 1138 | "add z18.s, z18.s, z3.s\n" |
| 1139 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 1140 | "add z19.s, z19.s, z4.s\n" |
| 1141 | "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n" |
| 1142 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 1143 | "addvl %[outptr2], %[outptr2], #3\n" |
| 1144 | "add z20.s, z20.s, z2.s\n" |
| 1145 | "ld1w z13.s, p1/z, [x8]\n" |
| 1146 | "st1w z14.s, p0, [%[outptr3]]\n" |
| 1147 | "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n" |
| 1148 | "add z13.s, z13.s, z3.s\n" |
| 1149 | "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 1150 | "add z14.s, z14.s, z4.s\n" |
| 1151 | "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 1152 | "addvl %[outptr3], %[outptr3], #3\n" |
| 1153 | "st1w z17.s, p0, [%[outptr4]]\n" |
| 1154 | "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 1155 | "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 1156 | "addvl %[outptr4], %[outptr4], #3\n" |
| 1157 | "st1w z20.s, p0, [%[outptr5]]\n" |
| 1158 | "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 1159 | "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 1160 | "addvl %[outptr5], %[outptr5], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1161 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 1162 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1163 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 1164 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1165 | ); |
| 1166 | } |
| 1167 | break; |
| 1168 | |
| 1169 | case 7: |
| 1170 | { |
| 1171 | long w = xmax - i; |
| 1172 | long p = 0; |
| 1173 | /* Optimized routine: add the bias vectors to a 7-row x 3-vector block of input and store the sums to outptr0..outptr6 */ |
| 1174 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1175 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1176 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1177 | "incw %[p], all, mul #1\n" |
| 1178 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1179 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 1180 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 1181 | "whilelt p1.s, %[p], %[w]\n" |
| 1182 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1183 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1184 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1185 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1186 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 1187 | "whilelt p2.s, %[p], %[w]\n" |
| 1188 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 1189 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 1190 | "add z13.s, z13.s, z2.s\n" |
| 1191 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 1192 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1193 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1194 | "add z14.s, z14.s, z3.s\n" |
| 1195 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 1196 | "add z15.s, z15.s, z4.s\n" |
| 1197 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 1198 | "add z16.s, z16.s, z2.s\n" |
| 1199 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 1200 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 1201 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 1202 | "add z17.s, z17.s, z3.s\n" |
| 1203 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 1204 | "add z18.s, z18.s, z4.s\n" |
| 1205 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 1206 | "add z19.s, z19.s, z2.s\n" |
| 1207 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
| 1208 | "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n" |
| 1209 | "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" |
| 1210 | "add z20.s, z20.s, z3.s\n" |
| 1211 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 1212 | "add z13.s, z13.s, z4.s\n" |
| 1213 | "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n" |
| 1214 | "add z14.s, z14.s, z2.s\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1215 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1216 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 1217 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 1218 | "add z15.s, z15.s, z3.s\n" |
| 1219 | "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n" |
| 1220 | "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" |
| 1221 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 1222 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 1223 | "add z16.s, z16.s, z4.s\n" |
| 1224 | "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n" |
| 1225 | "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" |
| 1226 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1227 | "addvl %[outptr1], %[outptr1], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1228 | "add z17.s, z17.s, z2.s\n" |
| 1229 | "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1230 | "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1231 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 1232 | "prfm PSTL1KEEP, [%[outptr6], #0x60]\n" |
| 1233 | "add z18.s, z18.s, z3.s\n" |
| 1234 | "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1235 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1236 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
| 1237 | "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n" |
| 1238 | "add z19.s, z19.s, z4.s\n" |
| 1239 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 1240 | "addvl %[outptr2], %[outptr2], #3\n" |
| 1241 | "add z20.s, z20.s, z2.s\n" |
| 1242 | "ld1w z13.s, p1/z, [x8]\n" |
| 1243 | "st1w z14.s, p0, [%[outptr3]]\n" |
| 1244 | "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n" |
| 1245 | "add z13.s, z13.s, z3.s\n" |
| 1246 | "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 1247 | "add z14.s, z14.s, z4.s\n" |
| 1248 | "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n" |
| 1249 | "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 1250 | "addvl %[outptr3], %[outptr3], #3\n" |
| 1251 | "add z15.s, z15.s, z2.s\n" |
| 1252 | "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n" |
| 1253 | "st1w z17.s, p0, [%[outptr4]]\n" |
| 1254 | "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n" |
| 1255 | "add z16.s, z16.s, z3.s\n" |
| 1256 | "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 1257 | "add z17.s, z17.s, z4.s\n" |
| 1258 | "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 1259 | "addvl %[outptr4], %[outptr4], #3\n" |
| 1260 | "st1w z20.s, p0, [%[outptr5]]\n" |
| 1261 | "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 1262 | "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 1263 | "addvl %[outptr5], %[outptr5], #3\n" |
| 1264 | "st1w z15.s, p0, [%[outptr6]]\n" |
| 1265 | "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n" |
| 1266 | "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n" |
| 1267 | "addvl %[outptr6], %[outptr6], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1268 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 1269 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1270 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 1271 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1272 | ); |
| 1273 | } |
| 1274 | break; |
| 1275 | |
| 1276 | default: |
| 1277 | case 8: |
| 1278 | { |
| 1279 | long w = xmax - i; |
| 1280 | long p = 0; |
| 1281 | /* Optimized routine: add the bias vectors to an 8-row x 3-vector block of input and store the sums to outptr0..outptr7 */ |
| 1282 | __asm __volatile ( |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1283 | "addvl x8, %[inptr], #16\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1284 | "whilelt p0.s, %[p], %[w]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1285 | "incw %[p], all, mul #1\n" |
| 1286 | "prfm PLDL1KEEP, [%[inptr], #0x180]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1287 | "prfm PSTL1KEEP, [%[outptr0], #0x60]\n" |
| 1288 | "ld1w z2.s, p0/z, [%[biasptr]]\n" |
| 1289 | "whilelt p1.s, %[p], %[w]\n" |
| 1290 | "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1291 | "incw %[p], all, mul #1\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1292 | "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1293 | "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1294 | "ld1w z13.s, p0/z, [%[inptr]]\n" |
| 1295 | "whilelt p2.s, %[p], %[w]\n" |
| 1296 | "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n" |
| 1297 | "prfm PSTL1KEEP, [%[outptr1], #0x60]\n" |
| 1298 | "add z13.s, z13.s, z2.s\n" |
| 1299 | "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n" |
| 1300 | "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1301 | "prfm PLDL1KEEP, [%[inptr], #0x200]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1302 | "add z14.s, z14.s, z3.s\n" |
| 1303 | "st1w z13.s, p0, [%[outptr0]]\n" |
| 1304 | "add z15.s, z15.s, z4.s\n" |
| 1305 | "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n" |
| 1306 | "add z16.s, z16.s, z2.s\n" |
| 1307 | "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n" |
| 1308 | "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n" |
| 1309 | "prfm PSTL1KEEP, [%[outptr2], #0x60]\n" |
| 1310 | "add z17.s, z17.s, z3.s\n" |
| 1311 | "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n" |
| 1312 | "add z18.s, z18.s, z4.s\n" |
| 1313 | "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n" |
| 1314 | "add z19.s, z19.s, z2.s\n" |
| 1315 | "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n" |
| 1316 | "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n" |
| 1317 | "prfm PSTL1KEEP, [%[outptr3], #0x60]\n" |
| 1318 | "add z20.s, z20.s, z3.s\n" |
| 1319 | "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n" |
| 1320 | "add z13.s, z13.s, z4.s\n" |
| 1321 | "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n" |
| 1322 | "add z14.s, z14.s, z2.s\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1323 | "addvl %[outptr0], %[outptr0], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1324 | "st1w z16.s, p0, [%[outptr1]]\n" |
| 1325 | "prfm PLDL1KEEP, [%[inptr], #0x240]\n" |
| 1326 | "add z15.s, z15.s, z3.s\n" |
| 1327 | "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n" |
| 1328 | "prfm PSTL1KEEP, [%[outptr4], #0x60]\n" |
| 1329 | "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n" |
| 1330 | "prfm PLDL1KEEP, [%[inptr], #0x280]\n" |
| 1331 | "add z16.s, z16.s, z4.s\n" |
| 1332 | "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n" |
| 1333 | "prfm PSTL1KEEP, [%[outptr5], #0x60]\n" |
| 1334 | "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1335 | "addvl %[outptr1], %[outptr1], #3\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1336 | "add z17.s, z17.s, z2.s\n" |
| 1337 | "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1338 | "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1339 | "st1w z19.s, p0, [%[outptr2]]\n" |
| 1340 | "prfm PSTL1KEEP, [%[outptr6], #0x60]\n" |
| 1341 | "add z18.s, z18.s, z3.s\n" |
| 1342 | "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n" |
| 1343 | "prfm PSTL1KEEP, [%[outptr7], #0x60]\n" |
| 1344 | "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1345 | "addvl %[inptr], %[inptr], #24\n" |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1346 | "add z19.s, z19.s, z4.s\n" |
| 1347 | "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n" |
| 1348 | "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n" |
| 1349 | "addvl %[outptr2], %[outptr2], #3\n" |
| 1350 | "add z20.s, z20.s, z2.s\n" |
| 1351 | "ld1w z13.s, p1/z, [x8]\n" |
| 1352 | "st1w z14.s, p0, [%[outptr3]]\n" |
| 1353 | "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n" |
| 1354 | "add z13.s, z13.s, z3.s\n" |
| 1355 | "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n" |
| 1356 | "add z14.s, z14.s, z4.s\n" |
| 1357 | "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n" |
| 1358 | "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n" |
| 1359 | "addvl %[outptr3], %[outptr3], #3\n" |
| 1360 | "add z15.s, z15.s, z2.s\n" |
| 1361 | "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n" |
| 1362 | "st1w z17.s, p0, [%[outptr4]]\n" |
| 1363 | "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n" |
| 1364 | "add z16.s, z16.s, z3.s\n" |
| 1365 | "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n" |
| 1366 | "add z17.s, z17.s, z4.s\n" |
| 1367 | "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n" |
| 1368 | "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n" |
| 1369 | "addvl %[outptr4], %[outptr4], #3\n" |
| 1370 | "add z18.s, z18.s, z2.s\n" |
| 1371 | "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n" |
| 1372 | "st1w z20.s, p0, [%[outptr5]]\n" |
| 1373 | "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n" |
| 1374 | "add z19.s, z19.s, z3.s\n" |
| 1375 | "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n" |
| 1376 | "add z20.s, z20.s, z4.s\n" |
| 1377 | "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n" |
| 1378 | "addvl %[outptr5], %[outptr5], #3\n" |
| 1379 | "st1w z15.s, p0, [%[outptr6]]\n" |
| 1380 | "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n" |
| 1381 | "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n" |
| 1382 | "addvl %[outptr6], %[outptr6], #3\n" |
| 1383 | "st1w z18.s, p0, [%[outptr7]]\n" |
| 1384 | "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n" |
| 1385 | "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n" |
| 1386 | "addvl %[outptr7], %[outptr7], #3\n" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1387 | : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), |
| 1388 | [inptr] "+r" (inptr), [p] "+r" (p) |
Georgios Pinitas | 48b3ef8 | 2019-10-14 19:03:09 +0100 | [diff] [blame] | 1389 | : [w] "r" (w), [biasptr] "r" (biasptr) |
| 1390 | : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc" |
Georgios Pinitas | cfa2bba | 2019-06-27 17:00:52 +0100 | [diff] [blame] | 1391 | ); |
| 1392 | } |
| 1393 | break; |
| 1394 | |
| 1395 | |
| 1396 | } |
| 1397 | } |
| 1398 | } |
| 1399 | } |
| 1400 | } |
| 1401 | |
| 1402 | #endif // __ARM_FEATURE_SVE |