blob: d4c5073f8d64eec90d66a4f9b6cf29e0303ae953 [file] [log] [blame]
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001/*
2 * Copyright (c) 2019 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
28template<>
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010029void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010030{
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010031 UNUSED(act);
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010032
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010033 const int32_t *inptr = in;
Georgios Pinitasc7b183a2020-03-06 18:12:09 +000034 int32_t nullbias[192];
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010035
36 if (!append && !bias)
37 {
38 memset(nullbias, 0, (3 * get_vector_length<int32_t>() * sizeof(int32_t)));
39 }
40
41 for (int y=y0; y<ymax; y+=8)
42 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010043 int32_t *outptr0 = out + (y * ldout) + x0;
44 int32_t *outptr1 = outptr0 + ldout;
45 int32_t *outptr2 = outptr1 + ldout;
46 int32_t *outptr3 = outptr2 + ldout;
47 int32_t *outptr4 = outptr3 + ldout;
48 int32_t *outptr5 = outptr4 + ldout;
49 int32_t *outptr6 = outptr5 + ldout;
50 int32_t *outptr7 = outptr6 + ldout;
51
52 const int height = ymax - y;
53
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010054 for (int i=x0; i<xmax; i+=(3 * get_vector_length<int32_t>()))
55 {
56 if (append)
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010057 {
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010058 switch(height)
59 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010060 case 1:
61 {
62 long w = xmax - i;
63 long p = 0;
64 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
65 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010066 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010067 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010068 "incw %[p], all, mul #1\n"
69 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010070 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
71 "ld1w z2.s, p0/z, [%[outptr0]]\n"
72 "whilelt p1.s, %[p], %[w]\n"
73 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010074 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010075 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
76 "add z10.s, z10.s, z2.s\n"
77 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
78 "whilelt p2.s, %[p], %[w]\n"
79 "add z11.s, z11.s, z3.s\n"
80 "st1w z10.s, p0, [%[outptr0]]\n"
81 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
82 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010083 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010084 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
85 "add z12.s, z12.s, z4.s\n"
86 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
87 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010088 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
89 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010090 : [w] "r" (w)
91 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010092 );
93 }
94 break;
95
96 case 2:
97 {
98 long w = xmax - i;
99 long p = 0;
100 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
101 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100102 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100103 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100104 "incw %[p], all, mul #1\n"
105 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100106 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
107 "ld1w z2.s, p0/z, [%[outptr0]]\n"
108 "whilelt p1.s, %[p], %[w]\n"
109 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100110 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100111 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100112 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100113 "add z10.s, z10.s, z2.s\n"
114 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
115 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
116 "whilelt p2.s, %[p], %[w]\n"
117 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
118 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
119 "add z11.s, z11.s, z3.s\n"
120 "st1w z10.s, p0, [%[outptr0]]\n"
121 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
122 "add z13.s, z13.s, z5.s\n"
123 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
124 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
125 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
126 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
127 "add z12.s, z12.s, z4.s\n"
128 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
129 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100130 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100131 "add z14.s, z14.s, z6.s\n"
132 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
133 "addvl %[outptr0], %[outptr0], #3\n"
134 "add z15.s, z15.s, z7.s\n"
135 "st1w z13.s, p0, [%[outptr1]]\n"
136 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
137 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
138 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100139 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
140 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100141 : [w] "r" (w)
142 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100143 );
144 }
145 break;
146
147 case 3:
148 {
149 long w = xmax - i;
150 long p = 0;
151 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
152 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100153 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100154 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100155 "incw %[p], all, mul #1\n"
156 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100157 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
158 "ld1w z2.s, p0/z, [%[outptr0]]\n"
159 "whilelt p1.s, %[p], %[w]\n"
160 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100161 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100162 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100163 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100164 "add z10.s, z10.s, z2.s\n"
165 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
166 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
167 "whilelt p2.s, %[p], %[w]\n"
168 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
169 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
170 "add z11.s, z11.s, z3.s\n"
171 "st1w z10.s, p0, [%[outptr0]]\n"
172 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100173 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100174 "add z13.s, z13.s, z5.s\n"
175 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
176 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
177 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
178 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
179 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
180 "add z12.s, z12.s, z4.s\n"
181 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
182 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
183 "ld1w z8.s, p0/z, [%[outptr2]]\n"
184 "add z14.s, z14.s, z6.s\n"
185 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
186 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100187 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100188 "add z15.s, z15.s, z7.s\n"
189 "st1w z13.s, p0, [%[outptr1]]\n"
190 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
191 "add z16.s, z16.s, z8.s\n"
192 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
193 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100194 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100195 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
196 "add z17.s, z17.s, z9.s\n"
197 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
198 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
199 "addvl %[outptr1], %[outptr1], #3\n"
200 "add z10.s, z10.s, z2.s\n"
201 "st1w z16.s, p0, [%[outptr2]]\n"
202 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
203 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
204 "addvl %[outptr2], %[outptr2], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100205 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
206 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100207 : [w] "r" (w)
208 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100209 );
210 }
211 break;
212
213 case 4:
214 {
215 long w = xmax - i;
216 long p = 0;
217 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
218 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100219 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100220 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100221 "incw %[p], all, mul #1\n"
222 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100223 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
224 "ld1w z2.s, p0/z, [%[outptr0]]\n"
225 "whilelt p1.s, %[p], %[w]\n"
226 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100227 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100228 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100229 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100230 "add z10.s, z10.s, z2.s\n"
231 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
232 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
233 "whilelt p2.s, %[p], %[w]\n"
234 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
235 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
236 "add z11.s, z11.s, z3.s\n"
237 "st1w z10.s, p0, [%[outptr0]]\n"
238 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100239 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100240 "add z13.s, z13.s, z5.s\n"
241 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
242 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
243 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
244 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
245 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
246 "add z12.s, z12.s, z4.s\n"
247 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
248 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
249 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
250 "ld1w z8.s, p0/z, [%[outptr2]]\n"
251 "add z14.s, z14.s, z6.s\n"
252 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
253 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100254 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100255 "add z15.s, z15.s, z7.s\n"
256 "st1w z13.s, p0, [%[outptr1]]\n"
257 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
258 "add z16.s, z16.s, z8.s\n"
259 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
260 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100261 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100262 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
263 "add z17.s, z17.s, z9.s\n"
264 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
265 "ld1w z3.s, p0/z, [%[outptr3]]\n"
266 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
267 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
268 "addvl %[outptr1], %[outptr1], #3\n"
269 "add z10.s, z10.s, z2.s\n"
270 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
271 "add z11.s, z11.s, z3.s\n"
272 "st1w z16.s, p0, [%[outptr2]]\n"
273 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
274 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
275 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
276 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
277 "add z12.s, z12.s, z4.s\n"
278 "add z13.s, z13.s, z5.s\n"
279 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
280 "addvl %[outptr2], %[outptr2], #3\n"
281 "st1w z11.s, p0, [%[outptr3]]\n"
282 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
283 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
284 "addvl %[outptr3], %[outptr3], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100285 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
286 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100287 : [w] "r" (w)
288 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100289 );
290 }
291 break;
292
293 case 5:
294 {
295 long w = xmax - i;
296 long p = 0;
297 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
298 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100299 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100300 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100301 "incw %[p], all, mul #1\n"
302 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100303 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
304 "ld1w z2.s, p0/z, [%[outptr0]]\n"
305 "whilelt p1.s, %[p], %[w]\n"
306 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100307 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100308 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100309 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100310 "add z10.s, z10.s, z2.s\n"
311 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
312 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
313 "whilelt p2.s, %[p], %[w]\n"
314 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
315 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
316 "add z11.s, z11.s, z3.s\n"
317 "st1w z10.s, p0, [%[outptr0]]\n"
318 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100319 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100320 "add z13.s, z13.s, z5.s\n"
321 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
322 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
323 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
324 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
325 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
326 "add z12.s, z12.s, z4.s\n"
327 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
328 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
329 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
330 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
331 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
332 "add z14.s, z14.s, z6.s\n"
333 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
334 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100335 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100336 "add z15.s, z15.s, z7.s\n"
337 "st1w z13.s, p0, [%[outptr1]]\n"
338 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
339 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
340 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100341 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100342 "add z16.s, z16.s, z8.s\n"
343 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
344 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
345 "add z17.s, z17.s, z9.s\n"
346 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
347 "ld1w z3.s, p0/z, [%[outptr3]]\n"
348 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
349 "addvl %[outptr1], %[outptr1], #3\n"
350 "add z10.s, z10.s, z2.s\n"
351 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
352 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
353 "st1w z16.s, p0, [%[outptr2]]\n"
354 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
355 "add z11.s, z11.s, z3.s\n"
356 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
357 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
358 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
359 "add z12.s, z12.s, z4.s\n"
360 "ld1w z6.s, p0/z, [%[outptr4]]\n"
361 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
362 "add z13.s, z13.s, z5.s\n"
363 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
364 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
365 "addvl %[outptr2], %[outptr2], #3\n"
366 "add z14.s, z14.s, z6.s\n"
367 "st1w z11.s, p0, [%[outptr3]]\n"
368 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
369 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
370 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
371 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
372 "add z15.s, z15.s, z7.s\n"
373 "add z16.s, z16.s, z8.s\n"
374 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
375 "addvl %[outptr3], %[outptr3], #3\n"
376 "st1w z14.s, p0, [%[outptr4]]\n"
377 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
378 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
379 "addvl %[outptr4], %[outptr4], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100380 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
381 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100382 : [w] "r" (w)
383 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100384 );
385 }
386 break;
387
388 case 6:
389 {
390 long w = xmax - i;
391 long p = 0;
392 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
393 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100394 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100395 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100396 "incw %[p], all, mul #1\n"
397 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100398 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
399 "ld1w z2.s, p0/z, [%[outptr0]]\n"
400 "whilelt p1.s, %[p], %[w]\n"
401 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100402 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100403 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100404 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100405 "add z10.s, z10.s, z2.s\n"
406 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
407 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
408 "whilelt p2.s, %[p], %[w]\n"
409 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
410 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
411 "add z11.s, z11.s, z3.s\n"
412 "st1w z10.s, p0, [%[outptr0]]\n"
413 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100414 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100415 "add z13.s, z13.s, z5.s\n"
416 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
417 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
418 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
419 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
420 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
421 "add z12.s, z12.s, z4.s\n"
422 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
423 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
424 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
425 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
426 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
427 "add z14.s, z14.s, z6.s\n"
428 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
429 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100430 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100431 "add z15.s, z15.s, z7.s\n"
432 "st1w z13.s, p0, [%[outptr1]]\n"
433 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
434 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
435 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
436 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
437 "add z16.s, z16.s, z8.s\n"
438 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
439 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100440 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100441 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
442 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
443 "addvl %[outptr1], %[outptr1], #3\n"
444 "add z17.s, z17.s, z9.s\n"
445 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
446 "ld1w z3.s, p0/z, [%[outptr3]]\n"
447 "st1w z16.s, p0, [%[outptr2]]\n"
448 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
449 "add z10.s, z10.s, z2.s\n"
450 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
451 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
452 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
453 "add z11.s, z11.s, z3.s\n"
454 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
455 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
456 "add z12.s, z12.s, z4.s\n"
457 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
458 "ld1w z6.s, p0/z, [%[outptr4]]\n"
459 "addvl %[outptr2], %[outptr2], #3\n"
460 "add z13.s, z13.s, z5.s\n"
461 "st1w z11.s, p0, [%[outptr3]]\n"
462 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
463 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
464 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
465 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
466 "add z14.s, z14.s, z6.s\n"
467 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
468 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
469 "add z15.s, z15.s, z7.s\n"
470 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
471 "ld1w z9.s, p0/z, [%[outptr5]]\n"
472 "addvl %[outptr3], %[outptr3], #3\n"
473 "add z16.s, z16.s, z8.s\n"
474 "st1w z14.s, p0, [%[outptr4]]\n"
475 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
476 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
477 "ld1w z10.s, p1/z, [x8]\n"
478 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
479 "add z17.s, z17.s, z9.s\n"
480 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
481 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
482 "add z10.s, z10.s, z2.s\n"
483 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
484 "addvl %[outptr4], %[outptr4], #3\n"
485 "add z11.s, z11.s, z3.s\n"
486 "st1w z17.s, p0, [%[outptr5]]\n"
487 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
488 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
489 "addvl %[outptr5], %[outptr5], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100490 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
491 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100492 : [w] "r" (w)
493 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100494 );
495 }
496 break;
497
498 case 7:
499 {
500 long w = xmax - i;
501 long p = 0;
502 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
503 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100504 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100505 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100506 "incw %[p], all, mul #1\n"
507 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100508 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
509 "ld1w z2.s, p0/z, [%[outptr0]]\n"
510 "whilelt p1.s, %[p], %[w]\n"
511 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100512 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100513 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100514 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100515 "add z10.s, z10.s, z2.s\n"
516 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
517 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
518 "whilelt p2.s, %[p], %[w]\n"
519 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
520 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
521 "add z11.s, z11.s, z3.s\n"
522 "st1w z10.s, p0, [%[outptr0]]\n"
523 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100524 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100525 "add z13.s, z13.s, z5.s\n"
526 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
527 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
528 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
529 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
530 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
531 "add z12.s, z12.s, z4.s\n"
532 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
533 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
534 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
535 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
536 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
537 "add z14.s, z14.s, z6.s\n"
538 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
539 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100540 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100541 "add z15.s, z15.s, z7.s\n"
542 "st1w z13.s, p0, [%[outptr1]]\n"
543 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
544 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
545 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
546 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
547 "add z16.s, z16.s, z8.s\n"
548 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
549 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100550 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100551 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
552 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
553 "add z17.s, z17.s, z9.s\n"
554 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
555 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
556 "addvl %[outptr1], %[outptr1], #3\n"
557 "ld1w z3.s, p0/z, [%[outptr3]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100558 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100559 "add z10.s, z10.s, z2.s\n"
560 "st1w z16.s, p0, [%[outptr2]]\n"
561 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
562 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
563 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
564 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
565 "add z11.s, z11.s, z3.s\n"
566 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
567 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
568 "add z12.s, z12.s, z4.s\n"
569 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
570 "ld1w z6.s, p0/z, [%[outptr4]]\n"
571 "addvl %[outptr2], %[outptr2], #3\n"
572 "add z13.s, z13.s, z5.s\n"
573 "st1w z11.s, p0, [%[outptr3]]\n"
574 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
575 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
576 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
577 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
578 "add z14.s, z14.s, z6.s\n"
579 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
580 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
581 "add z15.s, z15.s, z7.s\n"
582 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
583 "ld1w z9.s, p0/z, [%[outptr5]]\n"
584 "addvl %[outptr3], %[outptr3], #3\n"
585 "add z16.s, z16.s, z8.s\n"
586 "st1w z14.s, p0, [%[outptr4]]\n"
587 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
588 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
589 "ld1w z10.s, p1/z, [x8]\n"
590 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
591 "add z17.s, z17.s, z9.s\n"
592 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
593 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
594 "add z10.s, z10.s, z2.s\n"
595 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
596 "ld1w z4.s, p0/z, [%[outptr6]]\n"
597 "addvl %[outptr4], %[outptr4], #3\n"
598 "add z11.s, z11.s, z3.s\n"
599 "st1w z17.s, p0, [%[outptr5]]\n"
600 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
601 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
602 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
603 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
604 "add z12.s, z12.s, z4.s\n"
605 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
606 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
607 "add z13.s, z13.s, z5.s\n"
608 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
609 "addvl %[outptr5], %[outptr5], #3\n"
610 "add z14.s, z14.s, z6.s\n"
611 "st1w z12.s, p0, [%[outptr6]]\n"
612 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
613 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
614 "addvl %[outptr6], %[outptr6], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100615 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
616 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100617 : [w] "r" (w)
618 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100619 );
620 }
621 break;
622
623 default:
624 case 8:
625 {
626 long w = xmax - i;
627 long p = 0;
628 /* Optimized routine to accumulate an entire block into the existing output (out += in) */
629 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100630 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100631 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100632 "incw %[p], all, mul #1\n"
633 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100634 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
635 "ld1w z2.s, p0/z, [%[outptr0]]\n"
636 "whilelt p1.s, %[p], %[w]\n"
637 "ld1w z10.s, p0/z, [%[inptr]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100638 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100639 "ld1w z5.s, p0/z, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100640 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100641 "add z10.s, z10.s, z2.s\n"
642 "ld1w z3.s, p1/z, [%[outptr0], #1, MUL VL]\n"
643 "ld1w z11.s, p1/z, [%[inptr], #1, MUL VL]\n"
644 "whilelt p2.s, %[p], %[w]\n"
645 "ld1w z13.s, p0/z, [%[inptr], #3, MUL VL]\n"
646 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
647 "add z11.s, z11.s, z3.s\n"
648 "st1w z10.s, p0, [%[outptr0]]\n"
649 "ld1w z4.s, p2/z, [%[outptr0], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100650 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100651 "add z13.s, z13.s, z5.s\n"
652 "st1w z11.s, p1, [%[outptr0], #1, MUL VL]\n"
653 "ld1w z12.s, p2/z, [%[inptr], #2, MUL VL]\n"
654 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
655 "ld1w z6.s, p1/z, [%[outptr1], #1, MUL VL]\n"
656 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
657 "add z12.s, z12.s, z4.s\n"
658 "ld1w z14.s, p1/z, [%[inptr], #4, MUL VL]\n"
659 "ld1w z7.s, p2/z, [%[outptr1], #2, MUL VL]\n"
660 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
661 "ld1w z15.s, p2/z, [%[inptr], #5, MUL VL]\n"
662 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
663 "add z14.s, z14.s, z6.s\n"
664 "st1w z12.s, p2, [%[outptr0], #2, MUL VL]\n"
665 "ld1w z8.s, p0/z, [%[outptr2]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100666 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100667 "add z15.s, z15.s, z7.s\n"
668 "st1w z13.s, p0, [%[outptr1]]\n"
669 "ld1w z16.s, p0/z, [%[inptr], #6, MUL VL]\n"
670 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
671 "ld1w z9.s, p1/z, [%[outptr2], #1, MUL VL]\n"
672 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
673 "add z16.s, z16.s, z8.s\n"
674 "st1w z14.s, p1, [%[outptr1], #1, MUL VL]\n"
675 "ld1w z17.s, p1/z, [%[inptr], #7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100676 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100677 "ld1w z2.s, p2/z, [%[outptr2], #2, MUL VL]\n"
678 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
679 "add z17.s, z17.s, z9.s\n"
680 "st1w z15.s, p2, [%[outptr1], #2, MUL VL]\n"
681 "ld1w z10.s, p2/z, [x8, #-8, MUL VL]\n"
682 "addvl %[outptr1], %[outptr1], #3\n"
683 "ld1w z3.s, p0/z, [%[outptr3]]\n"
684 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
685 "add z10.s, z10.s, z2.s\n"
686 "st1w z16.s, p0, [%[outptr2]]\n"
687 "ld1w z11.s, p0/z, [x8, #-7, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100688 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100689 "ld1w z4.s, p1/z, [%[outptr3], #1, MUL VL]\n"
690 "st1w z17.s, p1, [%[outptr2], #1, MUL VL]\n"
691 "add z11.s, z11.s, z3.s\n"
692 "ld1w z12.s, p1/z, [x8, #-6, MUL VL]\n"
693 "ld1w z5.s, p2/z, [%[outptr3], #2, MUL VL]\n"
694 "ld1w z13.s, p2/z, [x8, #-5, MUL VL]\n"
695 "st1w z10.s, p2, [%[outptr2], #2, MUL VL]\n"
696 "addvl %[outptr2], %[outptr2], #3\n"
697 "add z12.s, z12.s, z4.s\n"
698 "ld1w z6.s, p0/z, [%[outptr4]]\n"
699 "add z13.s, z13.s, z5.s\n"
700 "st1w z11.s, p0, [%[outptr3]]\n"
701 "ld1w z14.s, p0/z, [x8, #-4, MUL VL]\n"
702 "ld1w z7.s, p1/z, [%[outptr4], #1, MUL VL]\n"
703 "ld1w z15.s, p1/z, [x8, #-3, MUL VL]\n"
704 "st1w z12.s, p1, [%[outptr3], #1, MUL VL]\n"
705 "add z14.s, z14.s, z6.s\n"
706 "ld1w z8.s, p2/z, [%[outptr4], #2, MUL VL]\n"
707 "ld1w z16.s, p2/z, [x8, #-2, MUL VL]\n"
708 "add z15.s, z15.s, z7.s\n"
709 "st1w z13.s, p2, [%[outptr3], #2, MUL VL]\n"
710 "ld1w z9.s, p0/z, [%[outptr5]]\n"
711 "addvl %[outptr3], %[outptr3], #3\n"
712 "add z16.s, z16.s, z8.s\n"
713 "st1w z14.s, p0, [%[outptr4]]\n"
714 "ld1w z17.s, p0/z, [x8, #-1, MUL VL]\n"
715 "ld1w z2.s, p1/z, [%[outptr5], #1, MUL VL]\n"
716 "ld1w z10.s, p1/z, [x8]\n"
717 "st1w z15.s, p1, [%[outptr4], #1, MUL VL]\n"
718 "add z17.s, z17.s, z9.s\n"
719 "ld1w z3.s, p2/z, [%[outptr5], #2, MUL VL]\n"
720 "ld1w z11.s, p2/z, [x8, #1, MUL VL]\n"
721 "add z10.s, z10.s, z2.s\n"
722 "st1w z16.s, p2, [%[outptr4], #2, MUL VL]\n"
723 "ld1w z4.s, p0/z, [%[outptr6]]\n"
724 "addvl %[outptr4], %[outptr4], #3\n"
725 "add z11.s, z11.s, z3.s\n"
726 "st1w z17.s, p0, [%[outptr5]]\n"
727 "ld1w z12.s, p0/z, [x8, #2, MUL VL]\n"
728 "ld1w z5.s, p1/z, [%[outptr6], #1, MUL VL]\n"
729 "ld1w z13.s, p1/z, [x8, #3, MUL VL]\n"
730 "st1w z10.s, p1, [%[outptr5], #1, MUL VL]\n"
731 "add z12.s, z12.s, z4.s\n"
732 "ld1w z6.s, p2/z, [%[outptr6], #2, MUL VL]\n"
733 "ld1w z14.s, p2/z, [x8, #4, MUL VL]\n"
734 "add z13.s, z13.s, z5.s\n"
735 "st1w z11.s, p2, [%[outptr5], #2, MUL VL]\n"
736 "ld1w z7.s, p0/z, [%[outptr7]]\n"
737 "addvl %[outptr5], %[outptr5], #3\n"
738 "add z14.s, z14.s, z6.s\n"
739 "st1w z12.s, p0, [%[outptr6]]\n"
740 "ld1w z15.s, p0/z, [x8, #5, MUL VL]\n"
741 "ld1w z8.s, p1/z, [%[outptr7], #1, MUL VL]\n"
742 "ld1w z16.s, p1/z, [x8, #6, MUL VL]\n"
743 "st1w z13.s, p1, [%[outptr6], #1, MUL VL]\n"
744 "add z15.s, z15.s, z7.s\n"
745 "ld1w z9.s, p2/z, [%[outptr7], #2, MUL VL]\n"
746 "ld1w z17.s, p2/z, [x8, #7, MUL VL]\n"
747 "add z16.s, z16.s, z8.s\n"
748 "st1w z14.s, p2, [%[outptr6], #2, MUL VL]\n"
749 "addvl %[outptr6], %[outptr6], #3\n"
750 "add z17.s, z17.s, z9.s\n"
751 "st1w z15.s, p0, [%[outptr7]]\n"
752 "st1w z16.s, p1, [%[outptr7], #1, MUL VL]\n"
753 "st1w z17.s, p2, [%[outptr7], #2, MUL VL]\n"
754 "addvl %[outptr7], %[outptr7], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100755 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
756 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100757 : [w] "r" (w)
758 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100759 );
760 }
761 break;
762
763
764 }
765 }
766 else
767 {
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100768 const int32_t *biasptr = nullbias;
769 if (bias)
770 {
771 biasptr = bias + i;
772 }
773
774 switch(height)
775 {
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100776 case 1:
777 {
778 long w = xmax - i;
779 long p = 0;
780 /* Optimized routine to copy an entire block */
781 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100782 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100783 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100784 "incw %[p], all, mul #1\n"
785 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100786 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
787 "ld1w z2.s, p0/z, [%[biasptr]]\n"
788 "whilelt p1.s, %[p], %[w]\n"
789 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100790 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100791 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
792 "ld1w z13.s, p0/z, [%[inptr]]\n"
793 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
794 "whilelt p2.s, %[p], %[w]\n"
795 "add z13.s, z13.s, z2.s\n"
796 "add z14.s, z14.s, z3.s\n"
797 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100798 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100799 "st1w z13.s, p0, [%[outptr0]]\n"
800 "add z15.s, z15.s, z4.s\n"
801 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
802 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
803 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100804 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
805 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100806 : [w] "r" (w), [biasptr] "r" (biasptr)
807 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100808 );
809 }
810 break;
811
812 case 2:
813 {
814 long w = xmax - i;
815 long p = 0;
816 /* Optimized routine to copy an entire block */
817 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100818 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100819 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100820 "incw %[p], all, mul #1\n"
821 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100822 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
823 "ld1w z2.s, p0/z, [%[biasptr]]\n"
824 "whilelt p1.s, %[p], %[w]\n"
825 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100826 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100827 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100828 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100829 "ld1w z13.s, p0/z, [%[inptr]]\n"
830 "whilelt p2.s, %[p], %[w]\n"
831 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
832 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
833 "add z13.s, z13.s, z2.s\n"
834 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
835 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
836 "add z14.s, z14.s, z3.s\n"
837 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
838 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100839 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100840 "add z15.s, z15.s, z4.s\n"
841 "st1w z13.s, p0, [%[outptr0]]\n"
842 "add z16.s, z16.s, z2.s\n"
843 "add z17.s, z17.s, z3.s\n"
844 "add z18.s, z18.s, z4.s\n"
845 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
846 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
847 "addvl %[outptr0], %[outptr0], #3\n"
848 "st1w z16.s, p0, [%[outptr1]]\n"
849 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
850 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
851 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100852 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
853 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100854 : [w] "r" (w), [biasptr] "r" (biasptr)
855 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100856 );
857 }
858 break;
859
860 case 3:
861 {
862 long w = xmax - i;
863 long p = 0;
864 /* Optimized routine to copy an entire block */
865 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100866 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100867 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100868 "incw %[p], all, mul #1\n"
869 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100870 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
871 "ld1w z2.s, p0/z, [%[biasptr]]\n"
872 "whilelt p1.s, %[p], %[w]\n"
873 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100874 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100875 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100876 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100877 "ld1w z13.s, p0/z, [%[inptr]]\n"
878 "whilelt p2.s, %[p], %[w]\n"
879 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
880 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
881 "add z13.s, z13.s, z2.s\n"
882 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
883 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100884 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100885 "add z14.s, z14.s, z3.s\n"
886 "st1w z13.s, p0, [%[outptr0]]\n"
887 "add z15.s, z15.s, z4.s\n"
888 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
889 "add z16.s, z16.s, z2.s\n"
890 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
891 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
892 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
893 "add z17.s, z17.s, z3.s\n"
894 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
895 "add z18.s, z18.s, z4.s\n"
896 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
897 "add z19.s, z19.s, z2.s\n"
898 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100899 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100900 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
901 "addvl %[outptr0], %[outptr0], #3\n"
902 "add z20.s, z20.s, z3.s\n"
903 "add z13.s, z13.s, z4.s\n"
904 "st1w z16.s, p0, [%[outptr1]]\n"
905 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
906 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
907 "addvl %[outptr1], %[outptr1], #3\n"
908 "st1w z19.s, p0, [%[outptr2]]\n"
909 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
910 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
911 "addvl %[outptr2], %[outptr2], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100912 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
913 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100914 : [w] "r" (w), [biasptr] "r" (biasptr)
915 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100916 );
917 }
918 break;
919
920 case 4:
921 {
922 long w = xmax - i;
923 long p = 0;
924 /* Optimized routine to copy an entire block */
925 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100926 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100927 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100928 "incw %[p], all, mul #1\n"
929 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100930 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
931 "ld1w z2.s, p0/z, [%[biasptr]]\n"
932 "whilelt p1.s, %[p], %[w]\n"
933 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100934 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100935 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100936 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100937 "ld1w z13.s, p0/z, [%[inptr]]\n"
938 "whilelt p2.s, %[p], %[w]\n"
939 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
940 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
941 "add z13.s, z13.s, z2.s\n"
942 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
943 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100944 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100945 "add z14.s, z14.s, z3.s\n"
946 "st1w z13.s, p0, [%[outptr0]]\n"
947 "add z15.s, z15.s, z4.s\n"
948 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
949 "add z16.s, z16.s, z2.s\n"
950 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
951 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
952 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
953 "add z17.s, z17.s, z3.s\n"
954 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
955 "add z18.s, z18.s, z4.s\n"
956 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
957 "add z19.s, z19.s, z2.s\n"
958 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
959 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
960 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
961 "add z20.s, z20.s, z3.s\n"
962 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
963 "add z13.s, z13.s, z4.s\n"
964 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
965 "add z14.s, z14.s, z2.s\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100966 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100967 "st1w z16.s, p0, [%[outptr1]]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100968 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100969 "add z15.s, z15.s, z3.s\n"
970 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
971 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
972 "add z16.s, z16.s, z4.s\n"
973 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
974 "addvl %[outptr1], %[outptr1], #3\n"
975 "st1w z19.s, p0, [%[outptr2]]\n"
976 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
977 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
978 "addvl %[outptr2], %[outptr2], #3\n"
979 "st1w z14.s, p0, [%[outptr3]]\n"
980 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
981 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
982 "addvl %[outptr3], %[outptr3], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100983 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
984 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100985 : [w] "r" (w), [biasptr] "r" (biasptr)
986 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100987 );
988 }
989 break;
990
991 case 5:
992 {
993 long w = xmax - i;
994 long p = 0;
995 /* Optimized routine to copy an entire block */
996 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100997 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100998 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100999 "incw %[p], all, mul #1\n"
1000 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001001 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1002 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1003 "whilelt p1.s, %[p], %[w]\n"
1004 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001005 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001006 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001007 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001008 "ld1w z13.s, p0/z, [%[inptr]]\n"
1009 "whilelt p2.s, %[p], %[w]\n"
1010 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1011 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1012 "add z13.s, z13.s, z2.s\n"
1013 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1014 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001015 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001016 "add z14.s, z14.s, z3.s\n"
1017 "st1w z13.s, p0, [%[outptr0]]\n"
1018 "add z15.s, z15.s, z4.s\n"
1019 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1020 "add z16.s, z16.s, z2.s\n"
1021 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1022 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1023 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1024 "add z17.s, z17.s, z3.s\n"
1025 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1026 "add z18.s, z18.s, z4.s\n"
1027 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1028 "add z19.s, z19.s, z2.s\n"
1029 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1030 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1031 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1032 "add z20.s, z20.s, z3.s\n"
1033 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1034 "add z13.s, z13.s, z4.s\n"
1035 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1036 "add z14.s, z14.s, z2.s\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001037 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001038 "st1w z16.s, p0, [%[outptr1]]\n"
1039 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1040 "add z15.s, z15.s, z3.s\n"
1041 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1042 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1043 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001044 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001045 "add z16.s, z16.s, z4.s\n"
1046 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1047 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
1048 "addvl %[outptr1], %[outptr1], #3\n"
1049 "add z17.s, z17.s, z2.s\n"
1050 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
1051 "st1w z19.s, p0, [%[outptr2]]\n"
1052 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1053 "add z18.s, z18.s, z3.s\n"
1054 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1055 "add z19.s, z19.s, z4.s\n"
1056 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1057 "addvl %[outptr2], %[outptr2], #3\n"
1058 "st1w z14.s, p0, [%[outptr3]]\n"
1059 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1060 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1061 "addvl %[outptr3], %[outptr3], #3\n"
1062 "st1w z17.s, p0, [%[outptr4]]\n"
1063 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1064 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1065 "addvl %[outptr4], %[outptr4], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001066 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1067 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001068 : [w] "r" (w), [biasptr] "r" (biasptr)
1069 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001070 );
1071 }
1072 break;
1073
1074 case 6:
1075 {
1076 long w = xmax - i;
1077 long p = 0;
1078 /* Optimized routine to copy an entire block */
1079 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001080 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001081 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001082 "incw %[p], all, mul #1\n"
1083 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001084 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1085 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1086 "whilelt p1.s, %[p], %[w]\n"
1087 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001088 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001089 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001090 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001091 "ld1w z13.s, p0/z, [%[inptr]]\n"
1092 "whilelt p2.s, %[p], %[w]\n"
1093 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1094 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1095 "add z13.s, z13.s, z2.s\n"
1096 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1097 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001098 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001099 "add z14.s, z14.s, z3.s\n"
1100 "st1w z13.s, p0, [%[outptr0]]\n"
1101 "add z15.s, z15.s, z4.s\n"
1102 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1103 "add z16.s, z16.s, z2.s\n"
1104 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1105 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1106 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1107 "add z17.s, z17.s, z3.s\n"
1108 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1109 "add z18.s, z18.s, z4.s\n"
1110 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1111 "add z19.s, z19.s, z2.s\n"
1112 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1113 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1114 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1115 "add z20.s, z20.s, z3.s\n"
1116 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1117 "add z13.s, z13.s, z4.s\n"
1118 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1119 "add z14.s, z14.s, z2.s\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001120 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001121 "st1w z16.s, p0, [%[outptr1]]\n"
1122 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1123 "add z15.s, z15.s, z3.s\n"
1124 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1125 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1126 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1127 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1128 "add z16.s, z16.s, z4.s\n"
1129 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1130 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1131 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001132 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001133 "add z17.s, z17.s, z2.s\n"
1134 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001135 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001136 "st1w z19.s, p0, [%[outptr2]]\n"
1137 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1138 "add z18.s, z18.s, z3.s\n"
1139 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1140 "add z19.s, z19.s, z4.s\n"
1141 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1142 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1143 "addvl %[outptr2], %[outptr2], #3\n"
1144 "add z20.s, z20.s, z2.s\n"
1145 "ld1w z13.s, p1/z, [x8]\n"
1146 "st1w z14.s, p0, [%[outptr3]]\n"
1147 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1148 "add z13.s, z13.s, z3.s\n"
1149 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1150 "add z14.s, z14.s, z4.s\n"
1151 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1152 "addvl %[outptr3], %[outptr3], #3\n"
1153 "st1w z17.s, p0, [%[outptr4]]\n"
1154 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1155 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1156 "addvl %[outptr4], %[outptr4], #3\n"
1157 "st1w z20.s, p0, [%[outptr5]]\n"
1158 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1159 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1160 "addvl %[outptr5], %[outptr5], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001161 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1162 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001163 : [w] "r" (w), [biasptr] "r" (biasptr)
1164 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001165 );
1166 }
1167 break;
1168
1169 case 7:
1170 {
1171 long w = xmax - i;
1172 long p = 0;
1173 /* Optimized routine to copy an entire block */
1174 __asm __volatile (
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001175 "addvl x8, %[inptr], #16\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001176 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001177 "incw %[p], all, mul #1\n"
1178 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001179 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1180 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1181 "whilelt p1.s, %[p], %[w]\n"
1182 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001183 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001184 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001185 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001186 "ld1w z13.s, p0/z, [%[inptr]]\n"
1187 "whilelt p2.s, %[p], %[w]\n"
1188 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1189 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1190 "add z13.s, z13.s, z2.s\n"
1191 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
1192 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001193 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001194 "add z14.s, z14.s, z3.s\n"
1195 "st1w z13.s, p0, [%[outptr0]]\n"
1196 "add z15.s, z15.s, z4.s\n"
1197 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1198 "add z16.s, z16.s, z2.s\n"
1199 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
1200 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1201 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1202 "add z17.s, z17.s, z3.s\n"
1203 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1204 "add z18.s, z18.s, z4.s\n"
1205 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1206 "add z19.s, z19.s, z2.s\n"
1207 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
1208 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1209 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1210 "add z20.s, z20.s, z3.s\n"
1211 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1212 "add z13.s, z13.s, z4.s\n"
1213 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1214 "add z14.s, z14.s, z2.s\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001215 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001216 "st1w z16.s, p0, [%[outptr1]]\n"
1217 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1218 "add z15.s, z15.s, z3.s\n"
1219 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1220 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1221 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1222 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1223 "add z16.s, z16.s, z4.s\n"
1224 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1225 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1226 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001227 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001228 "add z17.s, z17.s, z2.s\n"
1229 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001230 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001231 "st1w z19.s, p0, [%[outptr2]]\n"
1232 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1233 "add z18.s, z18.s, z3.s\n"
1234 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001235 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001236 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
1237 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1238 "add z19.s, z19.s, z4.s\n"
1239 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1240 "addvl %[outptr2], %[outptr2], #3\n"
1241 "add z20.s, z20.s, z2.s\n"
1242 "ld1w z13.s, p1/z, [x8]\n"
1243 "st1w z14.s, p0, [%[outptr3]]\n"
1244 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1245 "add z13.s, z13.s, z3.s\n"
1246 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1247 "add z14.s, z14.s, z4.s\n"
1248 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1249 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1250 "addvl %[outptr3], %[outptr3], #3\n"
1251 "add z15.s, z15.s, z2.s\n"
1252 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1253 "st1w z17.s, p0, [%[outptr4]]\n"
1254 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1255 "add z16.s, z16.s, z3.s\n"
1256 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1257 "add z17.s, z17.s, z4.s\n"
1258 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1259 "addvl %[outptr4], %[outptr4], #3\n"
1260 "st1w z20.s, p0, [%[outptr5]]\n"
1261 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1262 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1263 "addvl %[outptr5], %[outptr5], #3\n"
1264 "st1w z15.s, p0, [%[outptr6]]\n"
1265 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1266 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1267 "addvl %[outptr6], %[outptr6], #3\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001268 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1269 [inptr] "+r" (inptr), [p] "+r" (p)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001270 : [w] "r" (w), [biasptr] "r" (biasptr)
1271 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001272 );
1273 }
1274 break;
1275
/*
 * Height 8 (also taken for any height not matched by an earlier case label):
 * reads 24 consecutive SVE vectors from inptr (8 rows x 3 vectors per row),
 * adds a per-column bias vector to each and stores the sums through
 * outptr0..outptr7, three vectors per output pointer.
 */
1276 default:
1277 case 8:
1278 {
/* w is the upper bound for the WHILELT lane predicates below; i is declared outside this block (presumably the current column - TODO confirm). */
1279 long w = xmax - i;
/* p is the running lane index compared against w; INCW advances it by one vector of 32-bit lanes. */
1280 long p = 0;
1281 /* Optimized routine: add the bias to an entire 8-row block and store it */
1282 __asm __volatile (
/* x8 = inptr + 16 vector lengths: a second base so input vectors 8..23 are reachable with the signed -8..7 MUL VL immediate of LD1W. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001283 "addvl x8, %[inptr], #16\n"
/* p0/p1/p2 are the lane predicates for the first/second/third vector of columns in every row. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001284 "whilelt p0.s, %[p], %[w]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001285 "incw %[p], all, mul #1\n"
1286 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001287 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
/* z2/z3/z4 hold the bias for column vectors 0/1/2 and are reused for all eight rows. */
/* NOTE(review): all three bias loads are predicated on p0, not p0/p1/p2, so lanes past the valid width may be read from biasptr; the results are only stored under p1/p2 - confirm the bias buffer is large enough. */
1288 "ld1w z2.s, p0/z, [%[biasptr]]\n"
1289 "whilelt p1.s, %[p], %[w]\n"
1290 "ld1w z3.s, p0/z, [%[biasptr], #1, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001291 "incw %[p], all, mul #1\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001292 "ld1w z4.s, p0/z, [%[biasptr], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001293 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
/* Row 0: input vectors 0..2 -> outptr0. z13..z20 are recycled as a rotating set of temporaries from here on. */
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001294 "ld1w z13.s, p0/z, [%[inptr]]\n"
1295 "whilelt p2.s, %[p], %[w]\n"
1296 "ld1w z14.s, p1/z, [%[inptr], #1, MUL VL]\n"
1297 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1298 "add z13.s, z13.s, z2.s\n"
1299 "ld1w z15.s, p2/z, [%[inptr], #2, MUL VL]\n"
/* Row 1: input vectors 3..5 -> outptr1. */
1300 "ld1w z16.s, p0/z, [%[inptr], #3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001301 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001302 "add z14.s, z14.s, z3.s\n"
1303 "st1w z13.s, p0, [%[outptr0]]\n"
1304 "add z15.s, z15.s, z4.s\n"
1305 "ld1w z17.s, p1/z, [%[inptr], #4, MUL VL]\n"
1306 "add z16.s, z16.s, z2.s\n"
1307 "ld1w z18.s, p2/z, [%[inptr], #5, MUL VL]\n"
/* Row 2: input vectors 6..8 -> outptr2 (vector 8 is the first one addressed through x8). */
1308 "ld1w z19.s, p0/z, [%[inptr], #6, MUL VL]\n"
1309 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1310 "add z17.s, z17.s, z3.s\n"
1311 "st1w z14.s, p1, [%[outptr0], #1, MUL VL]\n"
1312 "add z18.s, z18.s, z4.s\n"
1313 "ld1w z20.s, p1/z, [%[inptr], #7, MUL VL]\n"
1314 "add z19.s, z19.s, z2.s\n"
1315 "ld1w z13.s, p2/z, [x8, #-8, MUL VL]\n"
/* Row 3: input vectors 9..11 -> outptr3. */
1316 "ld1w z14.s, p0/z, [x8, #-7, MUL VL]\n"
1317 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1318 "add z20.s, z20.s, z3.s\n"
1319 "st1w z15.s, p2, [%[outptr0], #2, MUL VL]\n"
1320 "add z13.s, z13.s, z4.s\n"
1321 "ld1w z15.s, p1/z, [x8, #-6, MUL VL]\n"
1322 "add z14.s, z14.s, z2.s\n"
/* Each output pointer is advanced by three vector lengths once its row has been stored. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001323 "addvl %[outptr0], %[outptr0], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001324 "st1w z16.s, p0, [%[outptr1]]\n"
1325 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1326 "add z15.s, z15.s, z3.s\n"
1327 "ld1w z16.s, p2/z, [x8, #-5, MUL VL]\n"
1328 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1329 "st1w z17.s, p1, [%[outptr1], #1, MUL VL]\n"
1330 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1331 "add z16.s, z16.s, z4.s\n"
/* Row 4: input vectors 12..14 -> outptr4. */
1332 "ld1w z17.s, p0/z, [x8, #-4, MUL VL]\n"
1333 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1334 "st1w z18.s, p2, [%[outptr1], #2, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001335 "addvl %[outptr1], %[outptr1], #3\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001336 "add z17.s, z17.s, z2.s\n"
1337 "ld1w z18.s, p1/z, [x8, #-3, MUL VL]\n"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001338 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001339 "st1w z19.s, p0, [%[outptr2]]\n"
1340 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1341 "add z18.s, z18.s, z3.s\n"
1342 "ld1w z19.s, p2/z, [x8, #-2, MUL VL]\n"
1343 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1344 "st1w z20.s, p1, [%[outptr2], #1, MUL VL]\n"
/* inptr moves past the whole 24-vector block here; every remaining load goes through x8, which was derived from the old inptr. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001345 "addvl %[inptr], %[inptr], #24\n"
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001346 "add z19.s, z19.s, z4.s\n"
/* Row 5: input vectors 15..17 -> outptr5. */
1347 "ld1w z20.s, p0/z, [x8, #-1, MUL VL]\n"
1348 "st1w z13.s, p2, [%[outptr2], #2, MUL VL]\n"
1349 "addvl %[outptr2], %[outptr2], #3\n"
1350 "add z20.s, z20.s, z2.s\n"
1351 "ld1w z13.s, p1/z, [x8]\n"
1352 "st1w z14.s, p0, [%[outptr3]]\n"
1353 "ld1w z14.s, p2/z, [x8, #1, MUL VL]\n"
1354 "add z13.s, z13.s, z3.s\n"
1355 "st1w z15.s, p1, [%[outptr3], #1, MUL VL]\n"
1356 "add z14.s, z14.s, z4.s\n"
/* Row 6: input vectors 18..20 -> outptr6. */
1357 "ld1w z15.s, p0/z, [x8, #2, MUL VL]\n"
1358 "st1w z16.s, p2, [%[outptr3], #2, MUL VL]\n"
1359 "addvl %[outptr3], %[outptr3], #3\n"
1360 "add z15.s, z15.s, z2.s\n"
1361 "ld1w z16.s, p1/z, [x8, #3, MUL VL]\n"
1362 "st1w z17.s, p0, [%[outptr4]]\n"
1363 "ld1w z17.s, p2/z, [x8, #4, MUL VL]\n"
1364 "add z16.s, z16.s, z3.s\n"
1365 "st1w z18.s, p1, [%[outptr4], #1, MUL VL]\n"
1366 "add z17.s, z17.s, z4.s\n"
/* Row 7: input vectors 21..23 -> outptr7. */
1367 "ld1w z18.s, p0/z, [x8, #5, MUL VL]\n"
1368 "st1w z19.s, p2, [%[outptr4], #2, MUL VL]\n"
1369 "addvl %[outptr4], %[outptr4], #3\n"
1370 "add z18.s, z18.s, z2.s\n"
1371 "ld1w z19.s, p1/z, [x8, #6, MUL VL]\n"
1372 "st1w z20.s, p0, [%[outptr5]]\n"
1373 "ld1w z20.s, p2/z, [x8, #7, MUL VL]\n"
1374 "add z19.s, z19.s, z3.s\n"
1375 "st1w z13.s, p1, [%[outptr5], #1, MUL VL]\n"
1376 "add z20.s, z20.s, z4.s\n"
1377 "st1w z14.s, p2, [%[outptr5], #2, MUL VL]\n"
1378 "addvl %[outptr5], %[outptr5], #3\n"
1379 "st1w z15.s, p0, [%[outptr6]]\n"
1380 "st1w z16.s, p1, [%[outptr6], #1, MUL VL]\n"
1381 "st1w z17.s, p2, [%[outptr6], #2, MUL VL]\n"
1382 "addvl %[outptr6], %[outptr6], #3\n"
1383 "st1w z18.s, p0, [%[outptr7]]\n"
1384 "st1w z19.s, p1, [%[outptr7], #1, MUL VL]\n"
1385 "st1w z20.s, p2, [%[outptr7], #2, MUL VL]\n"
1386 "addvl %[outptr7], %[outptr7], #3\n"
/* Read-write operands: the eight output pointers, inptr and p are all modified by the asm above. */
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001387 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1388 [inptr] "+r" (inptr), [p] "+r" (p)
/* Read-only operands: the width bound and the bias base pointer. */
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001389 : [w] "r" (w), [biasptr] "r" (biasptr)
/* NOTE(review): the asm writes p0, p1 and p2 via WHILELT but they are not named in this clobber list, while z0, z1 and z5..z12 are listed without being used in this block - confirm whether the predicate registers should be declared. */
1390 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001391 );
1392 }
1393 break;
1394
1395
1396 }
1397 }
1398 }
1399 }
1400}
1401
1402#endif // __ARM_FEATURE_SVE