blob: 115ba59459558964125884c3647f4e43184a7909 [file] [log] [blame]
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001/*
David Mansellce79ac62022-09-23 09:57:43 +01002 * Copyright (c) 2019-2020,2022 Arm Limited.
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
Michalis Spyrou20fca522021-06-07 14:23:57 +010026#ifdef ARM_COMPUTE_ENABLE_SVE
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010027
28template<>
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010029void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010030{
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010031 const int32_t *inptr = in;
Georgios Pinitasc7b183a2020-03-06 18:12:09 +000032 int32_t nullbias[192];
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010033
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010034
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010035 if (!append && !bias)
36 {
37 memset(nullbias, 0, (3 * get_vector_length<int32_t>() * sizeof(int32_t)));
38 }
39
40 for (int y=y0; y<ymax; y+=8)
41 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010042 int32_t *outptr0 = out + (y * ldout) + x0;
43 int32_t *outptr1 = outptr0 + ldout;
44 int32_t *outptr2 = outptr1 + ldout;
45 int32_t *outptr3 = outptr2 + ldout;
46 int32_t *outptr4 = outptr3 + ldout;
47 int32_t *outptr5 = outptr4 + ldout;
48 int32_t *outptr6 = outptr5 + ldout;
49 int32_t *outptr7 = outptr6 + ldout;
50
51 const int height = ymax - y;
52
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010053 for (int i=x0; i<xmax; i+=(3 * get_vector_length<int32_t>()))
54 {
55 if (append)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010056 {
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010057 switch(height)
58 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010059 case 1:
60 {
61 long w = xmax - i;
62 long p = 0;
63 /* Optimized routine to accumulate an entire input block into the output (out += in) */
64 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010065 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010066 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010067 "incw %[p], all, mul #1\n"
68 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010069 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
70 "ld1w z2.s, p0/z, [%[outptr0]]\n"
71 "whilelt p1.s, %[p], %[w]\n"
72 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010073 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010074 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
75 "add z10.s, z10.s, z2.s\n"
76 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
77 "whilelt p2.s, %[p], %[w]\n"
78 "add z11.s, z11.s, z3.s\n"
79 "st1w z10.s, p0, [%[outptr0]]\n"
80 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
81 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010082 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010083 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
84 "add z12.s, z12.s, z4.s\n"
85 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
86 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010087 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
88 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010089 : [w] "r" (w)
90 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010091 );
92 }
93 break;
94
95 case 2:
96 {
97 long w = xmax - i;
98 long p = 0;
99 /* Optimized routine to accumulate an entire input block into the output (out += in) */
100 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100101 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100102 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100103 "incw %[p], all, mul #1\n"
104 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100105 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
106 "ld1w z2.s, p0/z, [%[outptr0]]\n"
107 "whilelt p1.s, %[p], %[w]\n"
108 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100109 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100110 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100111 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100112 "add z10.s, z10.s, z2.s\n"
113 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
114 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
115 "whilelt p2.s, %[p], %[w]\n"
116 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
117 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
118 "add z11.s, z11.s, z3.s\n"
119 "st1w z10.s, p0, [%[outptr0]]\n"
120 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
121 "add z13.s, z13.s, z5.s\n"
122 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
123 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
124 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
125 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
126 "add z12.s, z12.s, z4.s\n"
127 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
128 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100129 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100130 "add z14.s, z14.s, z6.s\n"
131 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
132 "addvl %[outptr0], %[outptr0], #3\n"
133 "add z15.s, z15.s, z7.s\n"
134 "st1w z13.s, p0, [%[outptr1]]\n"
135 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
136 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
137 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100138 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
139 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100140 : [w] "r" (w)
141 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100142 );
143 }
144 break;
145
146 case 3:
147 {
148 long w = xmax - i;
149 long p = 0;
150 /* Optimized routine to accumulate an entire input block into the output (out += in) */
151 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100152 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100153 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100154 "incw %[p], all, mul #1\n"
155 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100156 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
157 "ld1w z2.s, p0/z, [%[outptr0]]\n"
158 "whilelt p1.s, %[p], %[w]\n"
159 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100160 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100161 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100162 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100163 "add z10.s, z10.s, z2.s\n"
164 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
165 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
166 "whilelt p2.s, %[p], %[w]\n"
167 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
168 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
169 "add z11.s, z11.s, z3.s\n"
170 "st1w z10.s, p0, [%[outptr0]]\n"
171 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100172 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100173 "add z13.s, z13.s, z5.s\n"
174 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
175 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
176 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
177 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
178 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
179 "add z12.s, z12.s, z4.s\n"
180 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
181 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
182 "ld1w z8.s, p0/z, [%[outptr2]]\n"
183 "add z14.s, z14.s, z6.s\n"
184 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
185 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100186 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100187 "add z15.s, z15.s, z7.s\n"
188 "st1w z13.s, p0, [%[outptr1]]\n"
189 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
190 "add z16.s, z16.s, z8.s\n"
191 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
192 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100193 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100194 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
195 "add z17.s, z17.s, z9.s\n"
196 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
197 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
198 "addvl %[outptr1], %[outptr1], #3\n"
199 "add z10.s, z10.s, z2.s\n"
200 "st1w z16.s, p0, [%[outptr2]]\n"
201 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
202 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
203 "addvl %[outptr2], %[outptr2], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100204 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
205 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100206 : [w] "r" (w)
207 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100208 );
209 }
210 break;
211
212 case 4:
213 {
214 long w = xmax - i;
215 long p = 0;
216 /* Optimized routine to accumulate an entire input block into the output (out += in) */
217 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100218 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100219 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100220 "incw %[p], all, mul #1\n"
221 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100222 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
223 "ld1w z2.s, p0/z, [%[outptr0]]\n"
224 "whilelt p1.s, %[p], %[w]\n"
225 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100226 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100227 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100228 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100229 "add z10.s, z10.s, z2.s\n"
230 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
231 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
232 "whilelt p2.s, %[p], %[w]\n"
233 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
234 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
235 "add z11.s, z11.s, z3.s\n"
236 "st1w z10.s, p0, [%[outptr0]]\n"
237 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100238 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100239 "add z13.s, z13.s, z5.s\n"
240 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
241 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
242 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
243 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
244 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
245 "add z12.s, z12.s, z4.s\n"
246 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
247 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
248 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
249 "ld1w z8.s, p0/z, [%[outptr2]]\n"
250 "add z14.s, z14.s, z6.s\n"
251 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
252 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100253 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100254 "add z15.s, z15.s, z7.s\n"
255 "st1w z13.s, p0, [%[outptr1]]\n"
256 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
257 "add z16.s, z16.s, z8.s\n"
258 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
259 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100260 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100261 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
262 "add z17.s, z17.s, z9.s\n"
263 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
264 "ld1w z3.s, p0/z, [%[outptr3]]\n"
265 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
266 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
267 "addvl %[outptr1], %[outptr1], #3\n"
268 "add z10.s, z10.s, z2.s\n"
269 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
270 "add z11.s, z11.s, z3.s\n"
271 "st1w z16.s, p0, [%[outptr2]]\n"
272 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
273 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
274 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
275 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
276 "add z12.s, z12.s, z4.s\n"
277 "add z13.s, z13.s, z5.s\n"
278 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
279 "addvl %[outptr2], %[outptr2], #3\n"
280 "st1w z11.s, p0, [%[outptr3]]\n"
281 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
282 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
283 "addvl %[outptr3], %[outptr3], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100284 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
285 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100286 : [w] "r" (w)
287 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100288 );
289 }
290 break;
291
292 case 5:
293 {
294 long w = xmax - i;
295 long p = 0;
296 /* Optimized routine to accumulate an entire input block into the output (out += in) */
297 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100298 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100299 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100300 "incw %[p], all, mul #1\n"
301 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100302 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
303 "ld1w z2.s, p0/z, [%[outptr0]]\n"
304 "whilelt p1.s, %[p], %[w]\n"
305 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100306 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100307 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100308 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100309 "add z10.s, z10.s, z2.s\n"
310 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
311 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
312 "whilelt p2.s, %[p], %[w]\n"
313 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
314 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
315 "add z11.s, z11.s, z3.s\n"
316 "st1w z10.s, p0, [%[outptr0]]\n"
317 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100318 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100319 "add z13.s, z13.s, z5.s\n"
320 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
321 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
322 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
323 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
324 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
325 "add z12.s, z12.s, z4.s\n"
326 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
327 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
328 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
329 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
330 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
331 "add z14.s, z14.s, z6.s\n"
332 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
333 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100334 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100335 "add z15.s, z15.s, z7.s\n"
336 "st1w z13.s, p0, [%[outptr1]]\n"
337 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
338 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
339 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100340 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100341 "add z16.s, z16.s, z8.s\n"
342 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
343 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
344 "add z17.s, z17.s, z9.s\n"
345 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
346 "ld1w z3.s, p0/z, [%[outptr3]]\n"
347 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
348 "addvl %[outptr1], %[outptr1], #3\n"
349 "add z10.s, z10.s, z2.s\n"
350 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
351 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
352 "st1w z16.s, p0, [%[outptr2]]\n"
353 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
354 "add z11.s, z11.s, z3.s\n"
355 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
356 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
357 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
358 "add z12.s, z12.s, z4.s\n"
359 "ld1w z6.s, p0/z, [%[outptr4]]\n"
360 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
361 "add z13.s, z13.s, z5.s\n"
362 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
363 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
364 "addvl %[outptr2], %[outptr2], #3\n"
365 "add z14.s, z14.s, z6.s\n"
366 "st1w z11.s, p0, [%[outptr3]]\n"
367 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
368 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
369 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
370 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
371 "add z15.s, z15.s, z7.s\n"
372 "add z16.s, z16.s, z8.s\n"
373 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
374 "addvl %[outptr3], %[outptr3], #3\n"
375 "st1w z14.s, p0, [%[outptr4]]\n"
376 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
377 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
378 "addvl %[outptr4], %[outptr4], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100379 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
380 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100381 : [w] "r" (w)
382 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100383 );
384 }
385 break;
386
387 case 6:
388 {
389 long w = xmax - i;
390 long p = 0;
391 /* Optimized routine to accumulate an entire input block into the output (out += in) */
392 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100393 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100394 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100395 "incw %[p], all, mul #1\n"
396 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100397 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
398 "ld1w z2.s, p0/z, [%[outptr0]]\n"
399 "whilelt p1.s, %[p], %[w]\n"
400 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100401 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100402 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100403 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100404 "add z10.s, z10.s, z2.s\n"
405 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
406 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
407 "whilelt p2.s, %[p], %[w]\n"
408 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
409 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
410 "add z11.s, z11.s, z3.s\n"
411 "st1w z10.s, p0, [%[outptr0]]\n"
412 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100413 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100414 "add z13.s, z13.s, z5.s\n"
415 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
416 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
417 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
418 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
419 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
420 "add z12.s, z12.s, z4.s\n"
421 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
422 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
423 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
424 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
425 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
426 "add z14.s, z14.s, z6.s\n"
427 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
428 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100429 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100430 "add z15.s, z15.s, z7.s\n"
431 "st1w z13.s, p0, [%[outptr1]]\n"
432 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
433 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
434 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
435 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
436 "add z16.s, z16.s, z8.s\n"
437 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
438 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100439 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100440 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
441 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
442 "addvl %[outptr1], %[outptr1], #3\n"
443 "add z17.s, z17.s, z9.s\n"
444 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
445 "ld1w z3.s, p0/z, [%[outptr3]]\n"
446 "st1w z16.s, p0, [%[outptr2]]\n"
447 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
448 "add z10.s, z10.s, z2.s\n"
449 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
450 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
451 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
452 "add z11.s, z11.s, z3.s\n"
453 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
454 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
455 "add z12.s, z12.s, z4.s\n"
456 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
457 "ld1w z6.s, p0/z, [%[outptr4]]\n"
458 "addvl %[outptr2], %[outptr2], #3\n"
459 "add z13.s, z13.s, z5.s\n"
460 "st1w z11.s, p0, [%[outptr3]]\n"
461 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
462 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
463 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
464 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
465 "add z14.s, z14.s, z6.s\n"
466 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
467 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
468 "add z15.s, z15.s, z7.s\n"
469 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
470 "ld1w z9.s, p0/z, [%[outptr5]]\n"
471 "addvl %[outptr3], %[outptr3], #3\n"
472 "add z16.s, z16.s, z8.s\n"
473 "st1w z14.s, p0, [%[outptr4]]\n"
474 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
475 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
476 "ld1w z10.s, p1/z, [x8]\n"
477 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
478 "add z17.s, z17.s, z9.s\n"
479 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
480 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
481 "add z10.s, z10.s, z2.s\n"
482 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
483 "addvl %[outptr4], %[outptr4], #3\n"
484 "add z11.s, z11.s, z3.s\n"
485 "st1w z17.s, p0, [%[outptr5]]\n"
486 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
487 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
488 "addvl %[outptr5], %[outptr5], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100489 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
490 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100491 : [w] "r" (w)
492 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100493 );
494 }
495 break;
496
497 case 7:
498 {
499 long w = xmax - i;
500 long p = 0;
501 /* Optimized routine to accumulate an entire input block into the output (out += in) */
502 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100503 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100504 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100505 "incw %[p], all, mul #1\n"
506 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100507 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
508 "ld1w z2.s, p0/z, [%[outptr0]]\n"
509 "whilelt p1.s, %[p], %[w]\n"
510 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100511 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100512 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100513 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100514 "add z10.s, z10.s, z2.s\n"
515 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
516 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
517 "whilelt p2.s, %[p], %[w]\n"
518 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
519 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
520 "add z11.s, z11.s, z3.s\n"
521 "st1w z10.s, p0, [%[outptr0]]\n"
522 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100523 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100524 "add z13.s, z13.s, z5.s\n"
525 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
526 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
527 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
528 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
529 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
530 "add z12.s, z12.s, z4.s\n"
531 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
532 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
533 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
534 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
535 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
536 "add z14.s, z14.s, z6.s\n"
537 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
538 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100539 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100540 "add z15.s, z15.s, z7.s\n"
541 "st1w z13.s, p0, [%[outptr1]]\n"
542 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
543 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
544 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
545 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
546 "add z16.s, z16.s, z8.s\n"
547 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
548 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100549 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100550 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
551 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
552 "add z17.s, z17.s, z9.s\n"
553 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
554 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
555 "addvl %[outptr1], %[outptr1], #3\n"
556 "ld1w z3.s, p0/z, [%[outptr3]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100557 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100558 "add z10.s, z10.s, z2.s\n"
559 "st1w z16.s, p0, [%[outptr2]]\n"
560 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
561 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
562 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
563 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
564 "add z11.s, z11.s, z3.s\n"
565 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
566 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
567 "add z12.s, z12.s, z4.s\n"
568 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
569 "ld1w z6.s, p0/z, [%[outptr4]]\n"
570 "addvl %[outptr2], %[outptr2], #3\n"
571 "add z13.s, z13.s, z5.s\n"
572 "st1w z11.s, p0, [%[outptr3]]\n"
573 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
574 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
575 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
576 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
577 "add z14.s, z14.s, z6.s\n"
578 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
579 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
580 "add z15.s, z15.s, z7.s\n"
581 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
582 "ld1w z9.s, p0/z, [%[outptr5]]\n"
583 "addvl %[outptr3], %[outptr3], #3\n"
584 "add z16.s, z16.s, z8.s\n"
585 "st1w z14.s, p0, [%[outptr4]]\n"
586 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
587 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
588 "ld1w z10.s, p1/z, [x8]\n"
589 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
590 "add z17.s, z17.s, z9.s\n"
591 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
592 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
593 "add z10.s, z10.s, z2.s\n"
594 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
595 "ld1w z4.s, p0/z, [%[outptr6]]\n"
596 "addvl %[outptr4], %[outptr4], #3\n"
597 "add z11.s, z11.s, z3.s\n"
598 "st1w z17.s, p0, [%[outptr5]]\n"
599 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
600 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
601 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
602 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
603 "add z12.s, z12.s, z4.s\n"
604 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
605 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
606 "add z13.s, z13.s, z5.s\n"
607 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
608 "addvl %[outptr5], %[outptr5], #3\n"
609 "add z14.s, z14.s, z6.s\n"
610 "st1w z12.s, p0, [%[outptr6]]\n"
611 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
612 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
613 "addvl %[outptr6], %[outptr6], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100614 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
615 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100616 : [w] "r" (w)
617 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100618 );
619 }
620 break;
621
622 default:
623 case 8:
624 {
625 long w = xmax - i;
626 long p = 0;
627 /* Optimized routine to accumulate an entire input block into the output (out += in) */
628 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100629 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100630 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100631 "incw %[p], all, mul #1\n"
632 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100633 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
634 "ld1w z2.s, p0/z, [%[outptr0]]\n"
635 "whilelt p1.s, %[p], %[w]\n"
636 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100637 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100638 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100639 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100640 "add z10.s, z10.s, z2.s\n"
641 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
642 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
643 "whilelt p2.s, %[p], %[w]\n"
644 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
645 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
646 "add z11.s, z11.s, z3.s\n"
647 "st1w z10.s, p0, [%[outptr0]]\n"
648 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100649 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100650 "add z13.s, z13.s, z5.s\n"
651 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
652 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
653 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
654 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
655 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
656 "add z12.s, z12.s, z4.s\n"
657 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
658 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
659 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
660 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
661 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
662 "add z14.s, z14.s, z6.s\n"
663 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
664 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100665 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100666 "add z15.s, z15.s, z7.s\n"
667 "st1w z13.s, p0, [%[outptr1]]\n"
668 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
669 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
670 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
671 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
672 "add z16.s, z16.s, z8.s\n"
673 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
674 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100675 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100676 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
677 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
678 "add z17.s, z17.s, z9.s\n"
679 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
680 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
681 "addvl %[outptr1], %[outptr1], #3\n"
682 "ld1w z3.s, p0/z, [%[outptr3]]\n"
683 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
684 "add z10.s, z10.s, z2.s\n"
685 "st1w z16.s, p0, [%[outptr2]]\n"
686 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100687 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100688 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
689 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
690 "add z11.s, z11.s, z3.s\n"
691 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
692 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
693 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
694 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
695 "addvl %[outptr2], %[outptr2], #3\n"
696 "add z12.s, z12.s, z4.s\n"
697 "ld1w z6.s, p0/z, [%[outptr4]]\n"
698 "add z13.s, z13.s, z5.s\n"
699 "st1w z11.s, p0, [%[outptr3]]\n"
700 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
701 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
702 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
703 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
704 "add z14.s, z14.s, z6.s\n"
705 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
706 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
707 "add z15.s, z15.s, z7.s\n"
708 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
709 "ld1w z9.s, p0/z, [%[outptr5]]\n"
710 "addvl %[outptr3], %[outptr3], #3\n"
711 "add z16.s, z16.s, z8.s\n"
712 "st1w z14.s, p0, [%[outptr4]]\n"
713 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
714 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
715 "ld1w z10.s, p1/z, [x8]\n"
716 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
717 "add z17.s, z17.s, z9.s\n"
718 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
719 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
720 "add z10.s, z10.s, z2.s\n"
721 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
722 "ld1w z4.s, p0/z, [%[outptr6]]\n"
723 "addvl %[outptr4], %[outptr4], #3\n"
724 "add z11.s, z11.s, z3.s\n"
725 "st1w z17.s, p0, [%[outptr5]]\n"
726 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
727 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
728 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
729 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
730 "add z12.s, z12.s, z4.s\n"
731 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
732 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
733 "add z13.s, z13.s, z5.s\n"
734 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
735 "ld1w z7.s, p0/z, [%[outptr7]]\n"
736 "addvl %[outptr5], %[outptr5], #3\n"
737 "add z14.s, z14.s, z6.s\n"
738 "st1w z12.s, p0, [%[outptr6]]\n"
739 "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
740 "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
741 "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
742 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
743 "add z15.s, z15.s, z7.s\n"
744 "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
745 "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
746 "add z16.s, z16.s, z8.s\n"
747 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
748 "addvl %[outptr6], %[outptr6], #3\n"
749 "add z17.s, z17.s, z9.s\n"
750 "st1w z15.s, p0, [%[outptr7]]\n"
751 "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
752 "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
753 "addvl %[outptr7], %[outptr7], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100754 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
755 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100756 : [w] "r" (w)
757 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100758 );
759 }
760 break;
761
762
763 }
764 }
765 else
766 {
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100767 const int32_t *biasptr = bias ? bias + i : nullbias;
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100768
769 switch(height)
770 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100771 case 1:
772 {
773 long w = xmax - i;
774 long p = 0;
/*
 * Bias path, one row: outptr0[...] = input row 0 + bias. Nothing is read
 * back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded from biasptr (set before
 *    the switch to bias + i, or to nullbias when bias is null).
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of the row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - inptr still advances by 24 vector lengths although only the first
 *    three input vectors are read; outptr0 advances by 3 vector lengths.
 *  - x8 is computed but not referenced by this variant, and
 *    outptr1..outptr7 are passed as operands without being used.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
775 /* Optimized routine to write one row of the block with the bias added */
776 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100777 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100778 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100779 "incw %[p], all, mul #1\n"
780 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100781 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
782 "ld1w z2.s, p0/z, [%[biasptr]]\n"
783 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100784 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100785 "incw %[p], all, mul #1\n"
786 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
787 "add z13.s, z13.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100788 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
789 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100790 "add z14.s, z14.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +0100791 "st1w z13.s, p0, [%[outptr0]]\n"
792 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100793 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
/* Skip the whole 24-vector input block, not just the one row consumed. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100794 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100795 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100796 "add z15.s, z15.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100797 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
798 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100799 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
800 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100801 : [w] "r" (w), [biasptr] "r" (biasptr)
802 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100803 );
804 }
805 break;
806
807 case 2:
808 {
809 long w = xmax - i;
810 long p = 0;
/*
 * Bias path, two rows: outptr<r>[...] = input row r + bias for r = 0..1.
 * Nothing is read back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded once from biasptr (set
 *    before the switch to bias + i, or to nullbias when bias is null) and
 *    reused for both rows.
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of a row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - Input row r is read from inptr + (3*r .. 3*r+2) vector lengths.
 *  - inptr advances by 24 vector lengths although only six input vectors
 *    are read; outptr0 and outptr1 advance by 3 vector lengths each.
 *  - x8 is computed but not referenced by this variant.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
811 /* Optimized routine to write two rows of the block with the bias added */
812 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100813 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100814 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100815 "incw %[p], all, mul #1\n"
816 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100817 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
818 "ld1w z2.s, p0/z, [%[biasptr]]\n"
819 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100820 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100821 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100822 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100823 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
824 "add z13.s, z13.s, z2.s\n"
825 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
826 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
827 "whilelt p2.s, %[p], %[w]\n"
828 "add z16.s, z16.s, z2.s\n"
829 "st1w z13.s, p0, [%[outptr0]]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100830 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100831 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
832 "add z14.s, z14.s, z3.s\n"
833 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
834 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
835 "add z17.s, z17.s, z3.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100836 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
/* Skip the whole 24-vector input block, not just the two rows consumed. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100837 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100838 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100839 "add z15.s, z15.s, z4.s\n"
840 "add z18.s, z18.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100841 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
842 "addvl %[outptr0], %[outptr0], #3\n"
843 "st1w z16.s, p0, [%[outptr1]]\n"
844 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
845 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
846 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100847 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
848 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100849 : [w] "r" (w), [biasptr] "r" (biasptr)
850 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100851 );
852 }
853 break;
854
855 case 3:
856 {
857 long w = xmax - i;
858 long p = 0;
/*
 * Bias path, three rows: outptr<r>[...] = input row r + bias for
 * r = 0..2. Nothing is read back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded once from biasptr (set
 *    before the switch to bias + i, or to nullbias when bias is null) and
 *    reused for every row.
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of a row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - Input row r is read from inptr + (3*r .. 3*r+2) vector lengths.
 *    x8 = inptr + 16 vector lengths, and input vector 8 is read as
 *    [x8, #-8].
 *  - inptr advances by 24 vector lengths although only nine input
 *    vectors are read; outptr0..outptr2 advance by 3 vector lengths each.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
859 /* Optimized routine to write three rows of the block with the bias added */
860 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100861 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100862 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100863 "incw %[p], all, mul #1\n"
864 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100865 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
866 "ld1w z2.s, p0/z, [%[biasptr]]\n"
867 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100868 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100869 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100870 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100871 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
872 "add z13.s, z13.s, z2.s\n"
873 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
874 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
875 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100876 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +0100877 "st1w z13.s, p0, [%[outptr0]]\n"
878 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
879 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
880 "add z14.s, z14.s, z3.s\n"
881 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
882 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
883 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100884 "add z17.s, z17.s, z3.s\n"
885 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100886 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
887 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
888 "add z15.s, z15.s, z4.s\n"
889 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100890 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
/* inptr moves past the whole 24-vector block here; the remaining input load goes through x8. */
David Mansellce79ac62022-09-23 09:57:43 +0100891 "addvl %[inptr], %[inptr], #24\n"
892 "add z18.s, z18.s, z4.s\n"
893 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100894 "add z19.s, z19.s, z2.s\n"
895 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100896 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +0100897 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100898 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100899 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100900 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
901 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
902 "addvl %[outptr1], %[outptr1], #3\n"
903 "st1w z19.s, p0, [%[outptr2]]\n"
904 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
905 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
906 "addvl %[outptr2], %[outptr2], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100907 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
908 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100909 : [w] "r" (w), [biasptr] "r" (biasptr)
910 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100911 );
912 }
913 break;
914
915 case 4:
916 {
917 long w = xmax - i;
918 long p = 0;
/*
 * Bias path, four rows: outptr<r>[...] = input row r + bias for
 * r = 0..3. Nothing is read back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded once from biasptr (set
 *    before the switch to bias + i, or to nullbias when bias is null) and
 *    reused for every row.
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of a row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - Input row r is read from inptr + (3*r .. 3*r+2) vector lengths.
 *    x8 = inptr + 16 vector lengths, and input vectors 8..11 are read
 *    as [x8, #-8] .. [x8, #-5].
 *  - z13..z16 are reused: once rows 0/1 have been stored or queued, the
 *    same registers carry the tail of row 2 and all of row 3.
 *  - inptr advances by 24 vector lengths although only twelve input
 *    vectors are read; outptr0..outptr3 advance by 3 vector lengths each.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
919 /* Optimized routine to write four rows of the block with the bias added */
920 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100921 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100922 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100923 "incw %[p], all, mul #1\n"
924 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100925 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
926 "ld1w z2.s, p0/z, [%[biasptr]]\n"
927 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100928 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100929 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100930 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100931 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932 "add z13.s, z13.s, z2.s\n"
933 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
934 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
935 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100936 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +0100937 "st1w z13.s, p0, [%[outptr0]]\n"
938 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
939 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
940 "add z14.s, z14.s, z3.s\n"
941 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
942 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
943 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100944 "add z17.s, z17.s, z3.s\n"
945 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100946 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
947 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
948 "add z15.s, z15.s, z4.s\n"
949 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100950 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100951 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
952 "add z18.s, z18.s, z4.s\n"
953 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100954 "add z19.s, z19.s, z2.s\n"
955 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100956 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +0100957 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100958 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100959 "addvl %[outptr0], %[outptr0], #3\n"
David Mansellce79ac62022-09-23 09:57:43 +0100960 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100961 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100962 "add z14.s, z14.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100963 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +0100964 "add z15.s, z15.s, z3.s\n"
/* All input loads are done; skip the whole 24-vector input block. */
965 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100966 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
967 "add z16.s, z16.s, z4.s\n"
968 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
969 "addvl %[outptr1], %[outptr1], #3\n"
970 "st1w z19.s, p0, [%[outptr2]]\n"
971 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
972 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
973 "addvl %[outptr2], %[outptr2], #3\n"
974 "st1w z14.s, p0, [%[outptr3]]\n"
975 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
976 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
977 "addvl %[outptr3], %[outptr3], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100978 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
979 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100980 : [w] "r" (w), [biasptr] "r" (biasptr)
981 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100982 );
983 }
984 break;
985
986 case 5:
987 {
988 long w = xmax - i;
989 long p = 0;
/*
 * Bias path, five rows: outptr<r>[...] = input row r + bias for
 * r = 0..4. Nothing is read back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded once from biasptr (set
 *    before the switch to bias + i, or to nullbias when bias is null) and
 *    reused for every row.
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of a row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - Input row r is read from inptr + (3*r .. 3*r+2) vector lengths.
 *    x8 = inptr + 16 vector lengths, and input vectors 8..14 are read
 *    as [x8, #-8] .. [x8, #-2].
 *  - z13..z19 are reused for later rows once their earlier contents have
 *    been stored, so the order of the loads and stores matters.
 *  - inptr advances by 24 vector lengths although only fifteen input
 *    vectors are read; outptr0..outptr4 advance by 3 vector lengths each.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
990 /* Optimized routine to write five rows of the block with the bias added */
991 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100992 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100993 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100994 "incw %[p], all, mul #1\n"
995 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100996 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
997 "ld1w z2.s, p0/z, [%[biasptr]]\n"
998 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100999 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001000 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001001 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001002 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1003 "add z13.s, z13.s, z2.s\n"
1004 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1005 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1006 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001007 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001008 "st1w z13.s, p0, [%[outptr0]]\n"
1009 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1010 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1011 "add z14.s, z14.s, z3.s\n"
1012 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1013 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1014 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001015 "add z17.s, z17.s, z3.s\n"
1016 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001017 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1018 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1019 "add z15.s, z15.s, z4.s\n"
1020 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001021 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001022 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1023 "add z18.s, z18.s, z4.s\n"
1024 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001025 "add z19.s, z19.s, z2.s\n"
1026 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001027 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001028 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001029 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001030 "addvl %[outptr0], %[outptr0], #3\n"
David Mansellce79ac62022-09-23 09:57:43 +01001031 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001032 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001033 "add z14.s, z14.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001034 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001035 "add z15.s, z15.s, z3.s\n"
1036 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001037 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001038 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001039 "add z16.s, z16.s, z4.s\n"
1040 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
/* inptr moves past the whole 24-vector block here; the remaining input loads go through x8. */
David Mansellce79ac62022-09-23 09:57:43 +01001041 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001042 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1043 "addvl %[outptr1], %[outptr1], #3\n"
1044 "add z17.s, z17.s, z2.s\n"
1045 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1046 "st1w z19.s, p0, [%[outptr2]]\n"
1047 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1048 "add z18.s, z18.s, z3.s\n"
1049 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1050 "add z19.s, z19.s, z4.s\n"
1051 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1052 "addvl %[outptr2], %[outptr2], #3\n"
1053 "st1w z14.s, p0, [%[outptr3]]\n"
1054 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1055 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1056 "addvl %[outptr3], %[outptr3], #3\n"
1057 "st1w z17.s, p0, [%[outptr4]]\n"
1058 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1059 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1060 "addvl %[outptr4], %[outptr4], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001061 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1062 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001063 : [w] "r" (w), [biasptr] "r" (biasptr)
1064 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001065 );
1066 }
1067 break;
1068
1069 case 6:
1070 {
1071 long w = xmax - i;
1072 long p = 0;
/*
 * Bias path, six rows: outptr<r>[...] = input row r + bias for
 * r = 0..5. Nothing is read back from the output.
 *
 *  - z2/z3/z4 are the three bias vectors loaded once from biasptr (set
 *    before the switch to bias + i, or to nullbias when bias is null) and
 *    reused for every row.
 *  - p0/p1/p2 mask the three 32-bit-lane vectors of a row; each comes
 *    from a WHILELT of p against w, with INCW stepping p in between.
 *  - Input row r is read from inptr + (3*r .. 3*r+2) vector lengths.
 *    x8 = inptr + 16 vector lengths, and input vectors 8..17 are read
 *    as [x8, #-8] .. [x8, #1].
 *  - z13..z20 are reused for later rows once their earlier contents have
 *    been stored, so the order of the loads and stores matters.
 *  - inptr advances by 24 vector lengths although only eighteen input
 *    vectors are read; outptr0..outptr5 advance by 3 vector lengths each.
 *
 * w is xmax - i; presumably the number of columns still to be written
 * (xmax and i are defined outside this block - TODO confirm).
 *
 * NOTE(review): the asm writes p0-p2, but the clobber list names no
 * predicate registers - confirm whether the toolchain needs them listed.
 */
1073 /* Optimized routine to write six rows of the block with the bias added */
1074 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001075 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001076 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001077 "incw %[p], all, mul #1\n"
1078 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001079 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1080 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1081 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001082 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001083 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001084 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001085 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1086 "add z13.s, z13.s, z2.s\n"
1087 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1088 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1089 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001090 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001091 "st1w z13.s, p0, [%[outptr0]]\n"
1092 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1093 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1094 "add z14.s, z14.s, z3.s\n"
1095 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1096 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1097 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001098 "add z17.s, z17.s, z3.s\n"
1099 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001100 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1101 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1102 "add z15.s, z15.s, z4.s\n"
1103 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001104 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001105 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1106 "add z18.s, z18.s, z4.s\n"
1107 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001108 "add z19.s, z19.s, z2.s\n"
1109 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001110 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001111 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001112 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001113 "addvl %[outptr0], %[outptr0], #3\n"
David Mansellce79ac62022-09-23 09:57:43 +01001114 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001115 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001116 "add z14.s, z14.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001117 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001118 "add z15.s, z15.s, z3.s\n"
1119 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001120 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001121 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001122 "add z16.s, z16.s, z4.s\n"
1123 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001124 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001125 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001126 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001127 "add z17.s, z17.s, z2.s\n"
1128 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001129 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001130 "st1w z19.s, p0, [%[outptr2]]\n"
/* inptr moves past the whole 24-vector block here; the remaining input loads go through x8. */
David Mansellce79ac62022-09-23 09:57:43 +01001131 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001132 "add z18.s, z18.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001133 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001134 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001135 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001136 "add z19.s, z19.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001137 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1138 "addvl %[outptr2], %[outptr2], #3\n"
1139 "add z20.s, z20.s, z2.s\n"
1140 "ld1w z13.s, p1/z, [x8]\n"
1141 "st1w z14.s, p0, [%[outptr3]]\n"
1142 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1143 "add z13.s, z13.s, z3.s\n"
1144 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1145 "add z14.s, z14.s, z4.s\n"
1146 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1147 "addvl %[outptr3], %[outptr3], #3\n"
1148 "st1w z17.s, p0, [%[outptr4]]\n"
1149 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1150 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1151 "addvl %[outptr4], %[outptr4], #3\n"
1152 "st1w z20.s, p0, [%[outptr5]]\n"
1153 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1154 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1155 "addvl %[outptr5], %[outptr5], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001156 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1157 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001158 : [w] "r" (w), [biasptr] "r" (biasptr)
1159 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001160 );
1161 }
1162 break;
1163
1164 case 7:
1165 {
1166 long w = xmax - i;
1167 long p = 0;
1168 /* Optimized routine to copy an entire block */
1169 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001170 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001171 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001172 "incw %[p], all, mul #1\n"
1173 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001174 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1175 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1176 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001177 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001178 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001179 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001180 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1181 "add z13.s, z13.s, z2.s\n"
1182 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1183 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1184 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001185 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001186 "st1w z13.s, p0, [%[outptr0]]\n"
1187 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1188 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1189 "add z14.s, z14.s, z3.s\n"
1190 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1191 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1192 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001193 "add z17.s, z17.s, z3.s\n"
1194 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001195 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1196 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1197 "add z15.s, z15.s, z4.s\n"
1198 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001199 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001200 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1201 "add z18.s, z18.s, z4.s\n"
1202 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001203 "add z19.s, z19.s, z2.s\n"
1204 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001205 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001206 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001207 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001208 "addvl %[outptr0], %[outptr0], #3\n"
David Mansellce79ac62022-09-23 09:57:43 +01001209 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001210 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001211 "add z14.s, z14.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001212 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001213 "add z15.s, z15.s, z3.s\n"
1214 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001215 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001216 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001217 "add z16.s, z16.s, z4.s\n"
1218 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001219 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001220 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001221 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001222 "add z17.s, z17.s, z2.s\n"
1223 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001224 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001225 "st1w z19.s, p0, [%[outptr2]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001226 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001227 "add z18.s, z18.s, z3.s\n"
1228 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001229 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001230 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001231 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001232 "add z19.s, z19.s, z4.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001233 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001234 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1235 "addvl %[outptr2], %[outptr2], #3\n"
1236 "add z20.s, z20.s, z2.s\n"
1237 "ld1w z13.s, p1/z, [x8]\n"
1238 "st1w z14.s, p0, [%[outptr3]]\n"
1239 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1240 "add z13.s, z13.s, z3.s\n"
1241 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1242 "add z14.s, z14.s, z4.s\n"
1243 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1244 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1245 "addvl %[outptr3], %[outptr3], #3\n"
1246 "add z15.s, z15.s, z2.s\n"
1247 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1248 "st1w z17.s, p0, [%[outptr4]]\n"
1249 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1250 "add z16.s, z16.s, z3.s\n"
1251 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1252 "add z17.s, z17.s, z4.s\n"
1253 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1254 "addvl %[outptr4], %[outptr4], #3\n"
1255 "st1w z20.s, p0, [%[outptr5]]\n"
1256 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1257 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1258 "addvl %[outptr5], %[outptr5], #3\n"
1259 "st1w z15.s, p0, [%[outptr6]]\n"
1260 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1261 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1262 "addvl %[outptr6], %[outptr6], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001263 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1264 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001265 : [w] "r" (w), [biasptr] "r" (biasptr)
1266 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001267 );
1268 }
1269 break;
1270
1271 default:
1272 case 8:
1273 {
1274 long w = xmax - i;
1275 long p = 0;
1276 /* Optimized routine to copy an entire block */
1277 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001278 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001279 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001280 "incw %[p], all, mul #1\n"
1281 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001282 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1283 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1284 "whilelt p1.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001285 "ld1w z13.s, p0/z, [%[inptr]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001286 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001287 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001288 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1289 "add z13.s, z13.s, z2.s\n"
1290 "ld1w z3.s, p1/z, [%[biasptr], #1, MUL VL]\n"
1291 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1292 "whilelt p2.s, %[p], %[w]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001293 "add z16.s, z16.s, z2.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001294 "st1w z13.s, p0, [%[outptr0]]\n"
1295 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1296 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1297 "add z14.s, z14.s, z3.s\n"
1298 "ld1w z4.s, p2/z, [%[biasptr], #2, MUL VL]\n"
1299 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1300 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001301 "add z17.s, z17.s, z3.s\n"
1302 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001303 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1304 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1305 "add z15.s, z15.s, z4.s\n"
1306 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001307 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001308 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1309 "add z18.s, z18.s, z4.s\n"
1310 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001311 "add z19.s, z19.s, z2.s\n"
1312 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001313 "add z20.s, z20.s, z3.s\n"
David Mansellce79ac62022-09-23 09:57:43 +01001314 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001315 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001316 "addvl %[outptr0], %[outptr0], #3\n"
David Mansellce79ac62022-09-23 09:57:43 +01001317 "add z13.s, z13.s, z4.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001318 "st1w z16.s, p0, [%[outptr1]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001319 "add z14.s, z14.s, z2.s\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001320 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001321 "add z15.s, z15.s, z3.s\n"
1322 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001323 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001324 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001325 "add z16.s, z16.s, z4.s\n"
1326 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001327 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001328 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001329 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001330 "add z17.s, z17.s, z2.s\n"
1331 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001332 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001333 "st1w z19.s, p0, [%[outptr2]]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001334 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001335 "add z18.s, z18.s, z3.s\n"
1336 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001337 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001338 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001339 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001340 "add z19.s, z19.s, z4.s\n"
1341 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
David Mansellce79ac62022-09-23 09:57:43 +01001342 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001343 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1344 "addvl %[outptr2], %[outptr2], #3\n"
1345 "add z20.s, z20.s, z2.s\n"
1346 "ld1w z13.s, p1/z, [x8]\n"
1347 "st1w z14.s, p0, [%[outptr3]]\n"
1348 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1349 "add z13.s, z13.s, z3.s\n"
1350 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1351 "add z14.s, z14.s, z4.s\n"
1352 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1353 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1354 "addvl %[outptr3], %[outptr3], #3\n"
1355 "add z15.s, z15.s, z2.s\n"
1356 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1357 "st1w z17.s, p0, [%[outptr4]]\n"
1358 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1359 "add z16.s, z16.s, z3.s\n"
1360 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1361 "add z17.s, z17.s, z4.s\n"
1362 "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1363 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1364 "addvl %[outptr4], %[outptr4], #3\n"
1365 "add z18.s, z18.s, z2.s\n"
1366 "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1367 "st1w z20.s, p0, [%[outptr5]]\n"
1368 "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1369 "add z19.s, z19.s, z3.s\n"
1370 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1371 "add z20.s, z20.s, z4.s\n"
1372 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1373 "addvl %[outptr5], %[outptr5], #3\n"
1374 "st1w z15.s, p0, [%[outptr6]]\n"
1375 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1376 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1377 "addvl %[outptr6], %[outptr6], #3\n"
1378 "st1w z18.s, p0, [%[outptr7]]\n"
1379 "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1380 "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1381 "addvl %[outptr7], %[outptr7], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001382 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1383 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001384 : [w] "r" (w), [biasptr] "r" (biasptr)
1385 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001386 );
1387 }
1388 break;
1389
1390
1391 }
1392 }
1393 }
1394 }
1395}
1396
Michalis Spyrou20fca522021-06-07 14:23:57 +01001397#endif // ARM_COMPUTE_ENABLE_SVE