blob: 4b51066ebee9b36c37ee660ecd0c7a1cbde7afc5 [file] [log] [blame]
1/*
2 * Copyright (c) 2018-2019 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
28template<>
29inline void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
30{
31 const float *inptr = in;
32
33 for (int y=y0; y<ymax; y+=8) {
34 float *outptr0 = out + (y * ldout) + x0;
35 float *outptr1 = outptr0 + ldout;
36 float *outptr2 = outptr1 + ldout;
37 float *outptr3 = outptr2 + ldout;
38 float *outptr4 = outptr3 + ldout;
39 float *outptr5 = outptr4 + ldout;
40 float *outptr6 = outptr5 + ldout;
41 float *outptr7 = outptr6 + ldout;
42
43 const int height = ymax - y;
44
45 for (int i=x0; i<xmax; i+=(3 * get_vector_length<float>())) {
46 if (beta==0.0f)
47 {
48 switch(height) {
49 case 1:
50 {
51 long w = xmax - i;
52 long p = 0;
53 /* Optimized routine to copy an entire block */
54 __asm __volatile (
55 "mov z2.s, %s[alpha]\n"
56 "addvl x8, %[inptr], #16\n"
57 "mov z3.s, %s[beta]\n"
58 "whilelt p0.s, %[p], %[w]\n"
59 "b.none 1f\n"
60 "ld1w z4.s, p0/z, [%[inptr]]\n"
61 "incw %[p], all, mul #1\n"
62 "fmul z8.s, z4.s, z2.s\n"
63 "st1w z8.s, p0, [%[outptr0]]\n"
64 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
65 "whilelt p0.s, %[p], %[w]\n"
66 "b.none 1f\n"
67 "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
68 "incw %[p], all, mul #1\n"
69 "fmul z9.s, z5.s, z2.s\n"
70 "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
71 "whilelt p0.s, %[p], %[w]\n"
72 "b.none 1f\n"
73 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
74 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
75 "fmul z10.s, z6.s, z2.s\n"
76 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
77 "addvl %[outptr0], %[outptr0], #3\n"
78 "1:\n"
79 "addvl %[inptr], %[inptr], #24\n"
80 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
81 [inptr] "+r" (inptr), [p] "+r" (p)
82 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
83 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
84 );
85 }
86 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +010087
Georgios Pinitas421405b2018-10-26 19:05:32 +010088 case 2:
89 {
90 long w = xmax - i;
91 long p = 0;
92 /* Optimized routine to copy an entire block */
93 __asm __volatile (
94 "mov z2.s, %s[alpha]\n"
95 "addvl x8, %[inptr], #16\n"
96 "mov z3.s, %s[beta]\n"
97 "whilelt p0.s, %[p], %[w]\n"
98 "b.none 1f\n"
99 "ld1w z4.s, p0/z, [%[inptr]]\n"
100 "incw %[p], all, mul #1\n"
101 "fmul z8.s, z4.s, z2.s\n"
102 "st1w z8.s, p0, [%[outptr0]]\n"
103 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
104 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
105 "fmul z9.s, z5.s, z2.s\n"
106 "st1w z9.s, p0, [%[outptr1]]\n"
107 "whilelt p0.s, %[p], %[w]\n"
108 "b.none 1f\n"
109 "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
110 "incw %[p], all, mul #1\n"
111 "fmul z10.s, z6.s, z2.s\n"
112 "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
113 "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
114 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
115 "fmul z11.s, z7.s, z2.s\n"
116 "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
117 "whilelt p0.s, %[p], %[w]\n"
118 "b.none 1f\n"
119 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
120 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
121 "fmul z8.s, z4.s, z2.s\n"
122 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
123 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
124 "addvl %[outptr0], %[outptr0], #3\n"
125 "fmul z9.s, z5.s, z2.s\n"
126 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
127 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
128 "addvl %[outptr1], %[outptr1], #3\n"
129 "1:\n"
130 "addvl %[inptr], %[inptr], #24\n"
131 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
132 [inptr] "+r" (inptr), [p] "+r" (p)
133 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
134 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
135 );
136 }
137 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100138
Georgios Pinitas421405b2018-10-26 19:05:32 +0100139 case 3:
140 {
141 long w = xmax - i;
142 long p = 0;
143 /* Optimized routine to copy an entire block */
144 __asm __volatile (
145 "mov z2.s, %s[alpha]\n"
146 "addvl x8, %[inptr], #16\n"
147 "mov z3.s, %s[beta]\n"
148 "whilelt p0.s, %[p], %[w]\n"
149 "b.none 1f\n"
150 "ld1w z4.s, p0/z, [%[inptr]]\n"
151 "incw %[p], all, mul #1\n"
152 "fmul z8.s, z4.s, z2.s\n"
153 "st1w z8.s, p0, [%[outptr0]]\n"
154 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
155 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
156 "fmul z9.s, z5.s, z2.s\n"
157 "st1w z9.s, p0, [%[outptr1]]\n"
158 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
159 "fmul z10.s, z6.s, z2.s\n"
160 "st1w z10.s, p0, [%[outptr2]]\n"
161 "whilelt p0.s, %[p], %[w]\n"
162 "b.none 1f\n"
163 "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
164 "incw %[p], all, mul #1\n"
165 "fmul z11.s, z7.s, z2.s\n"
166 "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
167 "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
168 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
169 "fmul z8.s, z4.s, z2.s\n"
170 "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
171 "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
172 "fmul z9.s, z5.s, z2.s\n"
173 "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
174 "whilelt p0.s, %[p], %[w]\n"
175 "b.none 1f\n"
176 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
177 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
178 "fmul z10.s, z6.s, z2.s\n"
179 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
180 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
181 "addvl %[outptr0], %[outptr0], #3\n"
182 "fmul z11.s, z7.s, z2.s\n"
183 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
184 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
185 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
186 "fmul z8.s, z4.s, z2.s\n"
187 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
188 "addvl %[outptr1], %[outptr1], #3\n"
189 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
190 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
191 "addvl %[outptr2], %[outptr2], #3\n"
192 "1:\n"
193 "addvl %[inptr], %[inptr], #24\n"
194 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
195 [inptr] "+r" (inptr), [p] "+r" (p)
196 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
197 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
198 );
199 }
200 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100201
Georgios Pinitas421405b2018-10-26 19:05:32 +0100202 case 4:
203 {
204 long w = xmax - i;
205 long p = 0;
206 /* Optimized routine to copy an entire block */
207 __asm __volatile (
208 "mov z2.s, %s[alpha]\n"
209 "addvl x8, %[inptr], #16\n"
210 "mov z3.s, %s[beta]\n"
211 "whilelt p0.s, %[p], %[w]\n"
212 "b.none 1f\n"
213 "ld1w z4.s, p0/z, [%[inptr]]\n"
214 "incw %[p], all, mul #1\n"
215 "fmul z8.s, z4.s, z2.s\n"
216 "st1w z8.s, p0, [%[outptr0]]\n"
217 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
218 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
219 "fmul z9.s, z5.s, z2.s\n"
220 "st1w z9.s, p0, [%[outptr1]]\n"
221 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
222 "fmul z10.s, z6.s, z2.s\n"
223 "st1w z10.s, p0, [%[outptr2]]\n"
224 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
225 "fmul z11.s, z7.s, z2.s\n"
226 "st1w z11.s, p0, [%[outptr3]]\n"
227 "whilelt p0.s, %[p], %[w]\n"
228 "b.none 1f\n"
229 "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
230 "incw %[p], all, mul #1\n"
231 "fmul z8.s, z4.s, z2.s\n"
232 "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
233 "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
234 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
235 "fmul z9.s, z5.s, z2.s\n"
236 "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
237 "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
238 "fmul z10.s, z6.s, z2.s\n"
239 "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
240 "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
241 "fmul z11.s, z7.s, z2.s\n"
242 "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
243 "whilelt p0.s, %[p], %[w]\n"
244 "b.none 1f\n"
245 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
246 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
247 "fmul z8.s, z4.s, z2.s\n"
248 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
249 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
250 "addvl %[outptr0], %[outptr0], #3\n"
251 "fmul z9.s, z5.s, z2.s\n"
252 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
253 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
254 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
255 "fmul z10.s, z6.s, z2.s\n"
256 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
257 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
258 "addvl %[outptr1], %[outptr1], #3\n"
259 "fmul z11.s, z7.s, z2.s\n"
260 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
261 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
262 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
263 "addvl %[outptr2], %[outptr2], #3\n"
264 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
265 "addvl %[outptr3], %[outptr3], #3\n"
266 "1:\n"
267 "addvl %[inptr], %[inptr], #24\n"
268 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
269 [inptr] "+r" (inptr), [p] "+r" (p)
270 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
271 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
272 );
273 }
274 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100275
Georgios Pinitas421405b2018-10-26 19:05:32 +0100276 case 5:
277 {
278 long w = xmax - i;
279 long p = 0;
280 /* Optimized routine to copy an entire block */
281 __asm __volatile (
282 "mov z2.s, %s[alpha]\n"
283 "addvl x8, %[inptr], #16\n"
284 "mov z3.s, %s[beta]\n"
285 "whilelt p0.s, %[p], %[w]\n"
286 "b.none 1f\n"
287 "ld1w z4.s, p0/z, [%[inptr]]\n"
288 "incw %[p], all, mul #1\n"
289 "fmul z8.s, z4.s, z2.s\n"
290 "st1w z8.s, p0, [%[outptr0]]\n"
291 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
292 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
293 "fmul z9.s, z5.s, z2.s\n"
294 "st1w z9.s, p0, [%[outptr1]]\n"
295 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
296 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
297 "fmul z10.s, z6.s, z2.s\n"
298 "st1w z10.s, p0, [%[outptr2]]\n"
299 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
300 "fmul z11.s, z7.s, z2.s\n"
301 "st1w z11.s, p0, [%[outptr3]]\n"
302 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
303 "fmul z8.s, z4.s, z2.s\n"
304 "st1w z8.s, p0, [%[outptr4]]\n"
305 "whilelt p0.s, %[p], %[w]\n"
306 "b.none 1f\n"
307 "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
308 "incw %[p], all, mul #1\n"
309 "fmul z9.s, z5.s, z2.s\n"
310 "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
311 "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
312 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
313 "fmul z10.s, z6.s, z2.s\n"
314 "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
315 "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
316 "fmul z11.s, z7.s, z2.s\n"
317 "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
318 "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
319 "fmul z8.s, z4.s, z2.s\n"
320 "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
321 "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
322 "fmul z9.s, z5.s, z2.s\n"
323 "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
324 "whilelt p0.s, %[p], %[w]\n"
325 "b.none 1f\n"
326 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
327 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
328 "fmul z10.s, z6.s, z2.s\n"
329 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
330 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
331 "addvl %[outptr0], %[outptr0], #3\n"
332 "fmul z11.s, z7.s, z2.s\n"
333 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
334 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
335 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
336 "fmul z8.s, z4.s, z2.s\n"
337 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
338 "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
339 "addvl %[outptr1], %[outptr1], #3\n"
340 "fmul z9.s, z5.s, z2.s\n"
341 "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
342 "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
343 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
344 "fmul z10.s, z6.s, z2.s\n"
345 "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
346 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
347 "addvl %[outptr2], %[outptr2], #3\n"
348 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
349 "addvl %[outptr3], %[outptr3], #3\n"
350 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
351 "addvl %[outptr4], %[outptr4], #3\n"
352 "1:\n"
353 "addvl %[inptr], %[inptr], #24\n"
354 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
355 [inptr] "+r" (inptr), [p] "+r" (p)
356 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
357 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
358 );
359 }
360 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100361
Georgios Pinitas421405b2018-10-26 19:05:32 +0100362 case 6:
363 {
364 long w = xmax - i;
365 long p = 0;
366 /* Optimized routine to copy an entire block */
367 __asm __volatile (
368 "mov z2.s, %s[alpha]\n"
369 "addvl x8, %[inptr], #16\n"
370 "mov z3.s, %s[beta]\n"
371 "whilelt p0.s, %[p], %[w]\n"
372 "b.none 1f\n"
373 "ld1w z4.s, p0/z, [%[inptr]]\n"
374 "incw %[p], all, mul #1\n"
375 "fmul z8.s, z4.s, z2.s\n"
376 "st1w z8.s, p0, [%[outptr0]]\n"
377 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
378 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
379 "fmul z9.s, z5.s, z2.s\n"
380 "st1w z9.s, p0, [%[outptr1]]\n"
381 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
382 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
383 "fmul z10.s, z6.s, z2.s\n"
384 "st1w z10.s, p0, [%[outptr2]]\n"
385 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
386 "fmul z11.s, z7.s, z2.s\n"
387 "st1w z11.s, p0, [%[outptr3]]\n"
388 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
389 "fmul z8.s, z4.s, z2.s\n"
390 "st1w z8.s, p0, [%[outptr4]]\n"
391 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
392 "fmul z9.s, z5.s, z2.s\n"
393 "st1w z9.s, p0, [%[outptr5]]\n"
394 "whilelt p0.s, %[p], %[w]\n"
395 "b.none 1f\n"
396 "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
397 "incw %[p], all, mul #1\n"
398 "fmul z10.s, z6.s, z2.s\n"
399 "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
400 "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
401 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
402 "fmul z11.s, z7.s, z2.s\n"
403 "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
404 "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
405 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
406 "fmul z8.s, z4.s, z2.s\n"
407 "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
408 "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
409 "fmul z9.s, z5.s, z2.s\n"
410 "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
411 "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
412 "fmul z10.s, z6.s, z2.s\n"
413 "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
414 "ld1w z7.s, p0/z, [x8]\n"
415 "fmul z11.s, z7.s, z2.s\n"
416 "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
417 "whilelt p0.s, %[p], %[w]\n"
418 "b.none 1f\n"
419 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
420 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
421 "fmul z8.s, z4.s, z2.s\n"
422 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
423 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
424 "addvl %[outptr0], %[outptr0], #3\n"
425 "fmul z9.s, z5.s, z2.s\n"
426 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
427 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
428 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
429 "fmul z10.s, z6.s, z2.s\n"
430 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
431 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
432 "addvl %[outptr1], %[outptr1], #3\n"
433 "fmul z11.s, z7.s, z2.s\n"
434 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
435 "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
436 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
437 "fmul z8.s, z4.s, z2.s\n"
438 "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
439 "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
440 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
441 "fmul z9.s, z5.s, z2.s\n"
442 "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
443 "addvl %[outptr2], %[outptr2], #3\n"
444 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
445 "addvl %[outptr3], %[outptr3], #3\n"
446 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
447 "addvl %[outptr4], %[outptr4], #3\n"
448 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
449 "addvl %[outptr5], %[outptr5], #3\n"
450 "1:\n"
451 "addvl %[inptr], %[inptr], #24\n"
452 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
453 [inptr] "+r" (inptr), [p] "+r" (p)
454 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
455 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
456 );
457 }
458 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100459
Georgios Pinitas421405b2018-10-26 19:05:32 +0100460 case 7:
461 {
462 long w = xmax - i;
463 long p = 0;
464 /* Optimized routine to copy an entire block */
465 __asm __volatile (
466 "mov z2.s, %s[alpha]\n"
467 "addvl x8, %[inptr], #16\n"
468 "mov z3.s, %s[beta]\n"
469 "whilelt p0.s, %[p], %[w]\n"
470 "b.none 1f\n"
471 "ld1w z4.s, p0/z, [%[inptr]]\n"
472 "incw %[p], all, mul #1\n"
473 "fmul z8.s, z4.s, z2.s\n"
474 "st1w z8.s, p0, [%[outptr0]]\n"
475 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
476 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
477 "fmul z9.s, z5.s, z2.s\n"
478 "st1w z9.s, p0, [%[outptr1]]\n"
479 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
480 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
481 "fmul z10.s, z6.s, z2.s\n"
482 "st1w z10.s, p0, [%[outptr2]]\n"
483 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
484 "fmul z11.s, z7.s, z2.s\n"
485 "st1w z11.s, p0, [%[outptr3]]\n"
486 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
487 "fmul z8.s, z4.s, z2.s\n"
488 "st1w z8.s, p0, [%[outptr4]]\n"
489 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
490 "fmul z9.s, z5.s, z2.s\n"
491 "st1w z9.s, p0, [%[outptr5]]\n"
492 "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
493 "fmul z10.s, z6.s, z2.s\n"
494 "st1w z10.s, p0, [%[outptr6]]\n"
495 "whilelt p0.s, %[p], %[w]\n"
496 "b.none 1f\n"
497 "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
498 "incw %[p], all, mul #1\n"
499 "fmul z11.s, z7.s, z2.s\n"
500 "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
501 "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
502 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
503 "fmul z8.s, z4.s, z2.s\n"
504 "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
505 "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
506 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
507 "fmul z9.s, z5.s, z2.s\n"
508 "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
509 "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
510 "fmul z10.s, z6.s, z2.s\n"
511 "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
512 "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
513 "fmul z11.s, z7.s, z2.s\n"
514 "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
515 "ld1w z4.s, p0/z, [x8]\n"
516 "fmul z8.s, z4.s, z2.s\n"
517 "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
518 "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
519 "fmul z9.s, z5.s, z2.s\n"
520 "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
521 "whilelt p0.s, %[p], %[w]\n"
522 "b.none 1f\n"
523 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
524 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
525 "fmul z10.s, z6.s, z2.s\n"
526 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
527 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
528 "addvl %[outptr0], %[outptr0], #3\n"
529 "fmul z11.s, z7.s, z2.s\n"
530 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
531 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
532 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
533 "fmul z8.s, z4.s, z2.s\n"
534 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
535 "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
536 "addvl %[outptr1], %[outptr1], #3\n"
537 "fmul z9.s, z5.s, z2.s\n"
538 "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
539 "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
540 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
541 "fmul z10.s, z6.s, z2.s\n"
542 "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
543 "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
544 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
545 "fmul z11.s, z7.s, z2.s\n"
546 "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
547 "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
548 "addvl %[outptr2], %[outptr2], #3\n"
549 "fmul z8.s, z4.s, z2.s\n"
550 "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
551 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
552 "addvl %[outptr3], %[outptr3], #3\n"
553 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
554 "addvl %[outptr4], %[outptr4], #3\n"
555 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
556 "addvl %[outptr5], %[outptr5], #3\n"
557 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
558 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
559 "addvl %[outptr6], %[outptr6], #3\n"
560 "1:\n"
561 "addvl %[inptr], %[inptr], #24\n"
562 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
563 [inptr] "+r" (inptr), [p] "+r" (p)
564 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
565 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
566 );
567 }
568 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100569
Georgios Pinitas421405b2018-10-26 19:05:32 +0100570 default:
571 case 8:
572 {
573 long w = xmax - i;
574 long p = 0;
575 /* Optimized routine to copy an entire block */
576 __asm __volatile (
577 "mov z2.s, %s[alpha]\n"
578 "addvl x8, %[inptr], #16\n"
579 "mov z3.s, %s[beta]\n"
580 "whilelt p0.s, %[p], %[w]\n"
581 "b.none 1f\n"
582 "ld1w z4.s, p0/z, [%[inptr]]\n"
583 "incw %[p], all, mul #1\n"
584 "fmul z8.s, z4.s, z2.s\n"
585 "st1w z8.s, p0, [%[outptr0]]\n"
586 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
587 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
588 "fmul z9.s, z5.s, z2.s\n"
589 "st1w z9.s, p0, [%[outptr1]]\n"
590 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
591 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
592 "fmul z10.s, z6.s, z2.s\n"
593 "st1w z10.s, p0, [%[outptr2]]\n"
594 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
595 "fmul z11.s, z7.s, z2.s\n"
596 "st1w z11.s, p0, [%[outptr3]]\n"
597 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
598 "fmul z8.s, z4.s, z2.s\n"
599 "st1w z8.s, p0, [%[outptr4]]\n"
600 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
601 "fmul z9.s, z5.s, z2.s\n"
602 "st1w z9.s, p0, [%[outptr5]]\n"
603 "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
604 "fmul z10.s, z6.s, z2.s\n"
605 "st1w z10.s, p0, [%[outptr6]]\n"
606 "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
607 "fmul z11.s, z7.s, z2.s\n"
608 "st1w z11.s, p0, [%[outptr7]]\n"
609 "whilelt p0.s, %[p], %[w]\n"
610 "b.none 1f\n"
611 "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
612 "incw %[p], all, mul #1\n"
613 "fmul z8.s, z4.s, z2.s\n"
614 "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
615 "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
616 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
617 "fmul z9.s, z5.s, z2.s\n"
618 "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
619 "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
620 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
621 "fmul z10.s, z6.s, z2.s\n"
622 "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
623 "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
624 "fmul z11.s, z7.s, z2.s\n"
625 "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
626 "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
627 "fmul z8.s, z4.s, z2.s\n"
628 "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
629 "ld1w z5.s, p0/z, [x8]\n"
630 "fmul z9.s, z5.s, z2.s\n"
631 "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
632 "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
633 "fmul z10.s, z6.s, z2.s\n"
634 "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
635 "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
636 "fmul z11.s, z7.s, z2.s\n"
637 "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
638 "whilelt p0.s, %[p], %[w]\n"
639 "b.none 1f\n"
640 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
641 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
642 "fmul z8.s, z4.s, z2.s\n"
643 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
644 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
645 "addvl %[outptr0], %[outptr0], #3\n"
646 "fmul z9.s, z5.s, z2.s\n"
647 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
648 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
649 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
650 "fmul z10.s, z6.s, z2.s\n"
651 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
652 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
653 "addvl %[outptr1], %[outptr1], #3\n"
654 "fmul z11.s, z7.s, z2.s\n"
655 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
656 "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
657 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
658 "fmul z8.s, z4.s, z2.s\n"
659 "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
660 "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
661 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
662 "fmul z9.s, z5.s, z2.s\n"
663 "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
664 "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
665 "addvl %[outptr2], %[outptr2], #3\n"
666 "fmul z10.s, z6.s, z2.s\n"
667 "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
668 "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
669 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
670 "fmul z11.s, z7.s, z2.s\n"
671 "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
672 "addvl %[outptr3], %[outptr3], #3\n"
673 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
674 "addvl %[outptr4], %[outptr4], #3\n"
675 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
676 "addvl %[outptr5], %[outptr5], #3\n"
677 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
678 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
679 "addvl %[outptr6], %[outptr6], #3\n"
680 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
681 "addvl %[outptr7], %[outptr7], #3\n"
682 "1:\n"
683 "addvl %[inptr], %[inptr], #24\n"
684 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
685 [inptr] "+r" (inptr), [p] "+r" (p)
686 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
687 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
688 );
689 }
690 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100691
692
Georgios Pinitas421405b2018-10-26 19:05:32 +0100693 }
694 }
695 else
696 {
697 switch(height) {
698 case 1:
699 {
700 long w = xmax - i;
701 long p = 0;
702 /* Optimized routine to copy an entire block */
703 __asm __volatile (
704 "mov z2.s, %s[alpha]\n"
705 "addvl x8, %[inptr], #16\n"
706 "mov z3.s, %s[beta]\n"
707 "whilelt p0.s, %[p], %[w]\n"
708 "b.none 1f\n"
709 "ld1w z8.s, p0/z, [%[outptr0]]\n"
710 "incw %[p], all, mul #1\n"
711 "fmul z8.s, z8.s, z3.s\n"
712 "ld1w z4.s, p0/z, [%[inptr]]\n"
713 "fmla z8.s, p0/m, z4.s, z2.s\n"
714 "st1w z8.s, p0, [%[outptr0]]\n"
715 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
716 "whilelt p0.s, %[p], %[w]\n"
717 "b.none 1f\n"
718 "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
719 "incw %[p], all, mul #1\n"
720 "fmul z9.s, z9.s, z3.s\n"
721 "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
722 "fmla z9.s, p0/m, z5.s, z2.s\n"
723 "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
724 "whilelt p0.s, %[p], %[w]\n"
725 "b.none 1f\n"
726 "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
727 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
728 "fmul z10.s, z10.s, z3.s\n"
729 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
730 "fmla z10.s, p0/m, z6.s, z2.s\n"
731 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
732 "addvl %[outptr0], %[outptr0], #3\n"
733 "1:\n"
734 "addvl %[inptr], %[inptr], #24\n"
735 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
736 [inptr] "+r" (inptr), [p] "+r" (p)
737 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
738 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
739 );
740 }
741 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100742
Georgios Pinitas421405b2018-10-26 19:05:32 +0100743 case 2:
744 {
745 long w = xmax - i;
746 long p = 0;
747 /* Optimized routine to copy an entire block */
748 __asm __volatile (
749 "mov z2.s, %s[alpha]\n"
750 "addvl x8, %[inptr], #16\n"
751 "mov z3.s, %s[beta]\n"
752 "whilelt p0.s, %[p], %[w]\n"
753 "b.none 1f\n"
754 "ld1w z8.s, p0/z, [%[outptr0]]\n"
755 "incw %[p], all, mul #1\n"
756 "fmul z8.s, z8.s, z3.s\n"
757 "ld1w z4.s, p0/z, [%[inptr]]\n"
758 "fmla z8.s, p0/m, z4.s, z2.s\n"
759 "st1w z8.s, p0, [%[outptr0]]\n"
760 "ld1w z9.s, p0/z, [%[outptr1]]\n"
761 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
762 "fmul z9.s, z9.s, z3.s\n"
763 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
764 "fmla z9.s, p0/m, z5.s, z2.s\n"
765 "st1w z9.s, p0, [%[outptr1]]\n"
766 "whilelt p0.s, %[p], %[w]\n"
767 "b.none 1f\n"
768 "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
769 "incw %[p], all, mul #1\n"
770 "fmul z10.s, z10.s, z3.s\n"
771 "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
772 "fmla z10.s, p0/m, z6.s, z2.s\n"
773 "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
774 "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
775 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
776 "fmul z11.s, z11.s, z3.s\n"
777 "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
778 "fmla z11.s, p0/m, z7.s, z2.s\n"
779 "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
780 "whilelt p0.s, %[p], %[w]\n"
781 "b.none 1f\n"
782 "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
783 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
784 "fmul z8.s, z8.s, z3.s\n"
785 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
786 "fmla z8.s, p0/m, z4.s, z2.s\n"
787 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
788 "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
789 "addvl %[outptr0], %[outptr0], #3\n"
790 "fmul z9.s, z9.s, z3.s\n"
791 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
792 "fmla z9.s, p0/m, z5.s, z2.s\n"
793 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
794 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
795 "addvl %[outptr1], %[outptr1], #3\n"
796 "1:\n"
797 "addvl %[inptr], %[inptr], #24\n"
798 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
799 [inptr] "+r" (inptr), [p] "+r" (p)
800 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
801 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
802 );
803 }
804 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100805
Georgios Pinitas421405b2018-10-26 19:05:32 +0100806 case 3:
807 {
808 long w = xmax - i;
809 long p = 0;
810 /* Optimized routine to copy an entire block */
811 __asm __volatile (
812 "mov z2.s, %s[alpha]\n"
813 "addvl x8, %[inptr], #16\n"
814 "mov z3.s, %s[beta]\n"
815 "whilelt p0.s, %[p], %[w]\n"
816 "b.none 1f\n"
817 "ld1w z8.s, p0/z, [%[outptr0]]\n"
818 "incw %[p], all, mul #1\n"
819 "fmul z8.s, z8.s, z3.s\n"
820 "ld1w z4.s, p0/z, [%[inptr]]\n"
821 "fmla z8.s, p0/m, z4.s, z2.s\n"
822 "st1w z8.s, p0, [%[outptr0]]\n"
823 "ld1w z9.s, p0/z, [%[outptr1]]\n"
824 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
825 "fmul z9.s, z9.s, z3.s\n"
826 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
827 "fmla z9.s, p0/m, z5.s, z2.s\n"
828 "st1w z9.s, p0, [%[outptr1]]\n"
829 "ld1w z10.s, p0/z, [%[outptr2]]\n"
830 "fmul z10.s, z10.s, z3.s\n"
831 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
832 "fmla z10.s, p0/m, z6.s, z2.s\n"
833 "st1w z10.s, p0, [%[outptr2]]\n"
834 "whilelt p0.s, %[p], %[w]\n"
835 "b.none 1f\n"
836 "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
837 "incw %[p], all, mul #1\n"
838 "fmul z11.s, z11.s, z3.s\n"
839 "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
840 "fmla z11.s, p0/m, z7.s, z2.s\n"
841 "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
842 "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
843 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
844 "fmul z8.s, z8.s, z3.s\n"
845 "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
846 "fmla z8.s, p0/m, z4.s, z2.s\n"
847 "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
848 "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
849 "fmul z9.s, z9.s, z3.s\n"
850 "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
851 "fmla z9.s, p0/m, z5.s, z2.s\n"
852 "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
853 "whilelt p0.s, %[p], %[w]\n"
854 "b.none 1f\n"
855 "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
856 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
857 "fmul z10.s, z10.s, z3.s\n"
858 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
859 "fmla z10.s, p0/m, z6.s, z2.s\n"
860 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
861 "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
862 "addvl %[outptr0], %[outptr0], #3\n"
863 "fmul z11.s, z11.s, z3.s\n"
864 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
865 "fmla z11.s, p0/m, z7.s, z2.s\n"
866 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
867 "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
868 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
869 "fmul z8.s, z8.s, z3.s\n"
870 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
871 "fmla z8.s, p0/m, z4.s, z2.s\n"
872 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
873 "addvl %[outptr1], %[outptr1], #3\n"
874 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
875 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
876 "addvl %[outptr2], %[outptr2], #3\n"
877 "1:\n"
878 "addvl %[inptr], %[inptr], #24\n"
879 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
880 [inptr] "+r" (inptr), [p] "+r" (p)
881 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
882 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
883 );
884 }
885 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100886
Georgios Pinitas421405b2018-10-26 19:05:32 +0100887 case 4:
888 {
889 long w = xmax - i;
890 long p = 0;
891 /* Optimized routine to copy an entire block */
892 __asm __volatile (
893 "mov z2.s, %s[alpha]\n"
894 "addvl x8, %[inptr], #16\n"
895 "mov z3.s, %s[beta]\n"
896 "whilelt p0.s, %[p], %[w]\n"
897 "b.none 1f\n"
898 "ld1w z8.s, p0/z, [%[outptr0]]\n"
899 "incw %[p], all, mul #1\n"
900 "fmul z8.s, z8.s, z3.s\n"
901 "ld1w z4.s, p0/z, [%[inptr]]\n"
902 "fmla z8.s, p0/m, z4.s, z2.s\n"
903 "st1w z8.s, p0, [%[outptr0]]\n"
904 "ld1w z9.s, p0/z, [%[outptr1]]\n"
905 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
906 "fmul z9.s, z9.s, z3.s\n"
907 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
908 "fmla z9.s, p0/m, z5.s, z2.s\n"
909 "st1w z9.s, p0, [%[outptr1]]\n"
910 "ld1w z10.s, p0/z, [%[outptr2]]\n"
911 "fmul z10.s, z10.s, z3.s\n"
912 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
913 "fmla z10.s, p0/m, z6.s, z2.s\n"
914 "st1w z10.s, p0, [%[outptr2]]\n"
915 "ld1w z11.s, p0/z, [%[outptr3]]\n"
916 "fmul z11.s, z11.s, z3.s\n"
917 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
918 "fmla z11.s, p0/m, z7.s, z2.s\n"
919 "st1w z11.s, p0, [%[outptr3]]\n"
920 "whilelt p0.s, %[p], %[w]\n"
921 "b.none 1f\n"
922 "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
923 "incw %[p], all, mul #1\n"
924 "fmul z8.s, z8.s, z3.s\n"
925 "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
926 "fmla z8.s, p0/m, z4.s, z2.s\n"
927 "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
928 "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
929 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
930 "fmul z9.s, z9.s, z3.s\n"
931 "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
932 "fmla z9.s, p0/m, z5.s, z2.s\n"
933 "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
934 "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
935 "fmul z10.s, z10.s, z3.s\n"
936 "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
937 "fmla z10.s, p0/m, z6.s, z2.s\n"
938 "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
939 "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
940 "fmul z11.s, z11.s, z3.s\n"
941 "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
942 "fmla z11.s, p0/m, z7.s, z2.s\n"
943 "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
944 "whilelt p0.s, %[p], %[w]\n"
945 "b.none 1f\n"
946 "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
947 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
948 "fmul z8.s, z8.s, z3.s\n"
949 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
950 "fmla z8.s, p0/m, z4.s, z2.s\n"
951 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
952 "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
953 "addvl %[outptr0], %[outptr0], #3\n"
954 "fmul z9.s, z9.s, z3.s\n"
955 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
956 "fmla z9.s, p0/m, z5.s, z2.s\n"
957 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
958 "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
959 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
960 "fmul z10.s, z10.s, z3.s\n"
961 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
962 "fmla z10.s, p0/m, z6.s, z2.s\n"
963 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
964 "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
965 "addvl %[outptr1], %[outptr1], #3\n"
966 "fmul z11.s, z11.s, z3.s\n"
967 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
968 "fmla z11.s, p0/m, z7.s, z2.s\n"
969 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
970 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
971 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
972 "addvl %[outptr2], %[outptr2], #3\n"
973 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
974 "addvl %[outptr3], %[outptr3], #3\n"
975 "1:\n"
976 "addvl %[inptr], %[inptr], #24\n"
977 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
978 [inptr] "+r" (inptr), [p] "+r" (p)
979 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
980 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
981 );
982 }
983 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +0100984
Georgios Pinitas421405b2018-10-26 19:05:32 +0100985 case 5:
986 {
987 long w = xmax - i;
988 long p = 0;
989 /* Optimized routine to copy an entire block */
990 __asm __volatile (
991 "mov z2.s, %s[alpha]\n"
992 "addvl x8, %[inptr], #16\n"
993 "mov z3.s, %s[beta]\n"
994 "whilelt p0.s, %[p], %[w]\n"
995 "b.none 1f\n"
996 "ld1w z8.s, p0/z, [%[outptr0]]\n"
997 "incw %[p], all, mul #1\n"
998 "fmul z8.s, z8.s, z3.s\n"
999 "ld1w z4.s, p0/z, [%[inptr]]\n"
1000 "fmla z8.s, p0/m, z4.s, z2.s\n"
1001 "st1w z8.s, p0, [%[outptr0]]\n"
1002 "ld1w z9.s, p0/z, [%[outptr1]]\n"
1003 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1004 "fmul z9.s, z9.s, z3.s\n"
1005 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
1006 "fmla z9.s, p0/m, z5.s, z2.s\n"
1007 "st1w z9.s, p0, [%[outptr1]]\n"
1008 "ld1w z10.s, p0/z, [%[outptr2]]\n"
1009 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1010 "fmul z10.s, z10.s, z3.s\n"
1011 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
1012 "fmla z10.s, p0/m, z6.s, z2.s\n"
1013 "st1w z10.s, p0, [%[outptr2]]\n"
1014 "ld1w z11.s, p0/z, [%[outptr3]]\n"
1015 "fmul z11.s, z11.s, z3.s\n"
1016 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
1017 "fmla z11.s, p0/m, z7.s, z2.s\n"
1018 "st1w z11.s, p0, [%[outptr3]]\n"
1019 "ld1w z8.s, p0/z, [%[outptr4]]\n"
1020 "fmul z8.s, z8.s, z3.s\n"
1021 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
1022 "fmla z8.s, p0/m, z4.s, z2.s\n"
1023 "st1w z8.s, p0, [%[outptr4]]\n"
1024 "whilelt p0.s, %[p], %[w]\n"
1025 "b.none 1f\n"
1026 "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
1027 "incw %[p], all, mul #1\n"
1028 "fmul z9.s, z9.s, z3.s\n"
1029 "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
1030 "fmla z9.s, p0/m, z5.s, z2.s\n"
1031 "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
1032 "ld1w z10.s, p0/z, [%[outptr1], #1, MUL VL]\n"
1033 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1034 "fmul z10.s, z10.s, z3.s\n"
1035 "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
1036 "fmla z10.s, p0/m, z6.s, z2.s\n"
1037 "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
1038 "ld1w z11.s, p0/z, [%[outptr2], #1, MUL VL]\n"
1039 "fmul z11.s, z11.s, z3.s\n"
1040 "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
1041 "fmla z11.s, p0/m, z7.s, z2.s\n"
1042 "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
1043 "ld1w z8.s, p0/z, [%[outptr3], #1, MUL VL]\n"
1044 "fmul z8.s, z8.s, z3.s\n"
1045 "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
1046 "fmla z8.s, p0/m, z4.s, z2.s\n"
1047 "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
1048 "ld1w z9.s, p0/z, [%[outptr4], #1, MUL VL]\n"
1049 "fmul z9.s, z9.s, z3.s\n"
1050 "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
1051 "fmla z9.s, p0/m, z5.s, z2.s\n"
1052 "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
1053 "whilelt p0.s, %[p], %[w]\n"
1054 "b.none 1f\n"
1055 "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
1056 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1057 "fmul z10.s, z10.s, z3.s\n"
1058 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
1059 "fmla z10.s, p0/m, z6.s, z2.s\n"
1060 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
1061 "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
1062 "addvl %[outptr0], %[outptr0], #3\n"
1063 "fmul z11.s, z11.s, z3.s\n"
1064 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
1065 "fmla z11.s, p0/m, z7.s, z2.s\n"
1066 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
1067 "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
1068 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1069 "fmul z8.s, z8.s, z3.s\n"
1070 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
1071 "fmla z8.s, p0/m, z4.s, z2.s\n"
1072 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
1073 "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
1074 "addvl %[outptr1], %[outptr1], #3\n"
1075 "fmul z9.s, z9.s, z3.s\n"
1076 "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
1077 "fmla z9.s, p0/m, z5.s, z2.s\n"
1078 "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
1079 "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
1080 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1081 "fmul z10.s, z10.s, z3.s\n"
1082 "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
1083 "fmla z10.s, p0/m, z6.s, z2.s\n"
1084 "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
1085 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1086 "addvl %[outptr2], %[outptr2], #3\n"
1087 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1088 "addvl %[outptr3], %[outptr3], #3\n"
1089 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1090 "addvl %[outptr4], %[outptr4], #3\n"
1091 "1:\n"
1092 "addvl %[inptr], %[inptr], #24\n"
1093 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1094 [inptr] "+r" (inptr), [p] "+r" (p)
1095 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
1096 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
1097 );
1098 }
1099 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001100
Georgios Pinitas421405b2018-10-26 19:05:32 +01001101 case 6:
1102 {
1103 long w = xmax - i;
1104 long p = 0;
1105 /* Optimized routine to copy an entire block */
1106 __asm __volatile (
1107 "mov z2.s, %s[alpha]\n"
1108 "addvl x8, %[inptr], #16\n"
1109 "mov z3.s, %s[beta]\n"
1110 "whilelt p0.s, %[p], %[w]\n"
1111 "b.none 1f\n"
1112 "ld1w z8.s, p0/z, [%[outptr0]]\n"
1113 "incw %[p], all, mul #1\n"
1114 "fmul z8.s, z8.s, z3.s\n"
1115 "ld1w z4.s, p0/z, [%[inptr]]\n"
1116 "fmla z8.s, p0/m, z4.s, z2.s\n"
1117 "st1w z8.s, p0, [%[outptr0]]\n"
1118 "ld1w z9.s, p0/z, [%[outptr1]]\n"
1119 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1120 "fmul z9.s, z9.s, z3.s\n"
1121 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
1122 "fmla z9.s, p0/m, z5.s, z2.s\n"
1123 "st1w z9.s, p0, [%[outptr1]]\n"
1124 "ld1w z10.s, p0/z, [%[outptr2]]\n"
1125 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1126 "fmul z10.s, z10.s, z3.s\n"
1127 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
1128 "fmla z10.s, p0/m, z6.s, z2.s\n"
1129 "st1w z10.s, p0, [%[outptr2]]\n"
1130 "ld1w z11.s, p0/z, [%[outptr3]]\n"
1131 "fmul z11.s, z11.s, z3.s\n"
1132 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
1133 "fmla z11.s, p0/m, z7.s, z2.s\n"
1134 "st1w z11.s, p0, [%[outptr3]]\n"
1135 "ld1w z8.s, p0/z, [%[outptr4]]\n"
1136 "fmul z8.s, z8.s, z3.s\n"
1137 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
1138 "fmla z8.s, p0/m, z4.s, z2.s\n"
1139 "st1w z8.s, p0, [%[outptr4]]\n"
1140 "ld1w z9.s, p0/z, [%[outptr5]]\n"
1141 "fmul z9.s, z9.s, z3.s\n"
1142 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
1143 "fmla z9.s, p0/m, z5.s, z2.s\n"
1144 "st1w z9.s, p0, [%[outptr5]]\n"
1145 "whilelt p0.s, %[p], %[w]\n"
1146 "b.none 1f\n"
1147 "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
1148 "incw %[p], all, mul #1\n"
1149 "fmul z10.s, z10.s, z3.s\n"
1150 "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
1151 "fmla z10.s, p0/m, z6.s, z2.s\n"
1152 "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
1153 "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
1154 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1155 "fmul z11.s, z11.s, z3.s\n"
1156 "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
1157 "fmla z11.s, p0/m, z7.s, z2.s\n"
1158 "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
1159 "ld1w z8.s, p0/z, [%[outptr2], #1, MUL VL]\n"
1160 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1161 "fmul z8.s, z8.s, z3.s\n"
1162 "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
1163 "fmla z8.s, p0/m, z4.s, z2.s\n"
1164 "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
1165 "ld1w z9.s, p0/z, [%[outptr3], #1, MUL VL]\n"
1166 "fmul z9.s, z9.s, z3.s\n"
1167 "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
1168 "fmla z9.s, p0/m, z5.s, z2.s\n"
1169 "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
1170 "ld1w z10.s, p0/z, [%[outptr4], #1, MUL VL]\n"
1171 "fmul z10.s, z10.s, z3.s\n"
1172 "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
1173 "fmla z10.s, p0/m, z6.s, z2.s\n"
1174 "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
1175 "ld1w z11.s, p0/z, [%[outptr5], #1, MUL VL]\n"
1176 "fmul z11.s, z11.s, z3.s\n"
1177 "ld1w z7.s, p0/z, [x8]\n"
1178 "fmla z11.s, p0/m, z7.s, z2.s\n"
1179 "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
1180 "whilelt p0.s, %[p], %[w]\n"
1181 "b.none 1f\n"
1182 "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
1183 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1184 "fmul z8.s, z8.s, z3.s\n"
1185 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
1186 "fmla z8.s, p0/m, z4.s, z2.s\n"
1187 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
1188 "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
1189 "addvl %[outptr0], %[outptr0], #3\n"
1190 "fmul z9.s, z9.s, z3.s\n"
1191 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
1192 "fmla z9.s, p0/m, z5.s, z2.s\n"
1193 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
1194 "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
1195 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1196 "fmul z10.s, z10.s, z3.s\n"
1197 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
1198 "fmla z10.s, p0/m, z6.s, z2.s\n"
1199 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
1200 "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
1201 "addvl %[outptr1], %[outptr1], #3\n"
1202 "fmul z11.s, z11.s, z3.s\n"
1203 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
1204 "fmla z11.s, p0/m, z7.s, z2.s\n"
1205 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
1206 "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
1207 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1208 "fmul z8.s, z8.s, z3.s\n"
1209 "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
1210 "fmla z8.s, p0/m, z4.s, z2.s\n"
1211 "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
1212 "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
1213 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1214 "fmul z9.s, z9.s, z3.s\n"
1215 "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
1216 "fmla z9.s, p0/m, z5.s, z2.s\n"
1217 "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
1218 "addvl %[outptr2], %[outptr2], #3\n"
1219 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1220 "addvl %[outptr3], %[outptr3], #3\n"
1221 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1222 "addvl %[outptr4], %[outptr4], #3\n"
1223 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1224 "addvl %[outptr5], %[outptr5], #3\n"
1225 "1:\n"
1226 "addvl %[inptr], %[inptr], #24\n"
1227 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1228 [inptr] "+r" (inptr), [p] "+r" (p)
1229 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
1230 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
1231 );
1232 }
1233 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001234
Georgios Pinitas421405b2018-10-26 19:05:32 +01001235 case 7:
1236 {
1237 long w = xmax - i;
1238 long p = 0;
1239 /* Optimized routine to copy an entire block */
1240 __asm __volatile (
1241 "mov z2.s, %s[alpha]\n"
1242 "addvl x8, %[inptr], #16\n"
1243 "mov z3.s, %s[beta]\n"
1244 "whilelt p0.s, %[p], %[w]\n"
1245 "b.none 1f\n"
1246 "ld1w z8.s, p0/z, [%[outptr0]]\n"
1247 "incw %[p], all, mul #1\n"
1248 "fmul z8.s, z8.s, z3.s\n"
1249 "ld1w z4.s, p0/z, [%[inptr]]\n"
1250 "fmla z8.s, p0/m, z4.s, z2.s\n"
1251 "st1w z8.s, p0, [%[outptr0]]\n"
1252 "ld1w z9.s, p0/z, [%[outptr1]]\n"
1253 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1254 "fmul z9.s, z9.s, z3.s\n"
1255 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
1256 "fmla z9.s, p0/m, z5.s, z2.s\n"
1257 "st1w z9.s, p0, [%[outptr1]]\n"
1258 "ld1w z10.s, p0/z, [%[outptr2]]\n"
1259 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1260 "fmul z10.s, z10.s, z3.s\n"
1261 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
1262 "fmla z10.s, p0/m, z6.s, z2.s\n"
1263 "st1w z10.s, p0, [%[outptr2]]\n"
1264 "ld1w z11.s, p0/z, [%[outptr3]]\n"
1265 "fmul z11.s, z11.s, z3.s\n"
1266 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
1267 "fmla z11.s, p0/m, z7.s, z2.s\n"
1268 "st1w z11.s, p0, [%[outptr3]]\n"
1269 "ld1w z8.s, p0/z, [%[outptr4]]\n"
1270 "fmul z8.s, z8.s, z3.s\n"
1271 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
1272 "fmla z8.s, p0/m, z4.s, z2.s\n"
1273 "st1w z8.s, p0, [%[outptr4]]\n"
1274 "ld1w z9.s, p0/z, [%[outptr5]]\n"
1275 "fmul z9.s, z9.s, z3.s\n"
1276 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
1277 "fmla z9.s, p0/m, z5.s, z2.s\n"
1278 "st1w z9.s, p0, [%[outptr5]]\n"
1279 "ld1w z10.s, p0/z, [%[outptr6]]\n"
1280 "fmul z10.s, z10.s, z3.s\n"
1281 "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
1282 "fmla z10.s, p0/m, z6.s, z2.s\n"
1283 "st1w z10.s, p0, [%[outptr6]]\n"
1284 "whilelt p0.s, %[p], %[w]\n"
1285 "b.none 1f\n"
1286 "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
1287 "incw %[p], all, mul #1\n"
1288 "fmul z11.s, z11.s, z3.s\n"
1289 "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
1290 "fmla z11.s, p0/m, z7.s, z2.s\n"
1291 "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
1292 "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
1293 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1294 "fmul z8.s, z8.s, z3.s\n"
1295 "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
1296 "fmla z8.s, p0/m, z4.s, z2.s\n"
1297 "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
1298 "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
1299 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1300 "fmul z9.s, z9.s, z3.s\n"
1301 "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
1302 "fmla z9.s, p0/m, z5.s, z2.s\n"
1303 "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
1304 "ld1w z10.s, p0/z, [%[outptr3], #1, MUL VL]\n"
1305 "fmul z10.s, z10.s, z3.s\n"
1306 "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
1307 "fmla z10.s, p0/m, z6.s, z2.s\n"
1308 "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
1309 "ld1w z11.s, p0/z, [%[outptr4], #1, MUL VL]\n"
1310 "fmul z11.s, z11.s, z3.s\n"
1311 "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
1312 "fmla z11.s, p0/m, z7.s, z2.s\n"
1313 "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
1314 "ld1w z8.s, p0/z, [%[outptr5], #1, MUL VL]\n"
1315 "fmul z8.s, z8.s, z3.s\n"
1316 "ld1w z4.s, p0/z, [x8]\n"
1317 "fmla z8.s, p0/m, z4.s, z2.s\n"
1318 "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
1319 "ld1w z9.s, p0/z, [%[outptr6], #1, MUL VL]\n"
1320 "fmul z9.s, z9.s, z3.s\n"
1321 "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
1322 "fmla z9.s, p0/m, z5.s, z2.s\n"
1323 "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
1324 "whilelt p0.s, %[p], %[w]\n"
1325 "b.none 1f\n"
1326 "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
1327 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1328 "fmul z10.s, z10.s, z3.s\n"
1329 "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
1330 "fmla z10.s, p0/m, z6.s, z2.s\n"
1331 "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
1332 "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
1333 "addvl %[outptr0], %[outptr0], #3\n"
1334 "fmul z11.s, z11.s, z3.s\n"
1335 "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
1336 "fmla z11.s, p0/m, z7.s, z2.s\n"
1337 "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
1338 "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
1339 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1340 "fmul z8.s, z8.s, z3.s\n"
1341 "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
1342 "fmla z8.s, p0/m, z4.s, z2.s\n"
1343 "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
1344 "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
1345 "addvl %[outptr1], %[outptr1], #3\n"
1346 "fmul z9.s, z9.s, z3.s\n"
1347 "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
1348 "fmla z9.s, p0/m, z5.s, z2.s\n"
1349 "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
1350 "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
1351 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1352 "fmul z10.s, z10.s, z3.s\n"
1353 "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
1354 "fmla z10.s, p0/m, z6.s, z2.s\n"
1355 "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
1356 "ld1w z11.s, p0/z, [%[outptr5], #2, MUL VL]\n"
1357 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1358 "fmul z11.s, z11.s, z3.s\n"
1359 "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
1360 "fmla z11.s, p0/m, z7.s, z2.s\n"
1361 "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
1362 "ld1w z8.s, p0/z, [%[outptr6], #2, MUL VL]\n"
1363 "addvl %[outptr2], %[outptr2], #3\n"
1364 "fmul z8.s, z8.s, z3.s\n"
1365 "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
1366 "fmla z8.s, p0/m, z4.s, z2.s\n"
1367 "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
1368 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1369 "addvl %[outptr3], %[outptr3], #3\n"
1370 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1371 "addvl %[outptr4], %[outptr4], #3\n"
1372 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1373 "addvl %[outptr5], %[outptr5], #3\n"
1374 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1375 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1376 "addvl %[outptr6], %[outptr6], #3\n"
1377 "1:\n"
1378 "addvl %[inptr], %[inptr], #24\n"
1379 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1380 [inptr] "+r" (inptr), [p] "+r" (p)
1381 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
1382 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
1383 );
1384 }
1385 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001386
Georgios Pinitas421405b2018-10-26 19:05:32 +01001387 default:
1388 case 8:
1389 {
1390 long w = xmax - i;
1391 long p = 0;
1392 /* Optimized routine to copy an entire block */
1393 __asm __volatile (
1394 "mov z2.s, %s[alpha]\n"
1395 "addvl x8, %[inptr], #16\n"
1396 "mov z3.s, %s[beta]\n"
1397 "whilelt p0.s, %[p], %[w]\n"
1398 "b.none 1f\n"
1399 "ld1w z8.s, p0/z, [%[outptr0]]\n"
1400 "incw %[p], all, mul #1\n"
1401 "fmul z8.s, z8.s, z3.s\n"
1402 "ld1w z4.s, p0/z, [%[inptr]]\n"
1403 "fmla z8.s, p0/m, z4.s, z2.s\n"
1404 "st1w z8.s, p0, [%[outptr0]]\n"
1405 "ld1w z9.s, p0/z, [%[outptr1]]\n"
1406 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1407 "fmul z9.s, z9.s, z3.s\n"
1408 "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
1409 "fmla z9.s, p0/m, z5.s, z2.s\n"
1410 "st1w z9.s, p0, [%[outptr1]]\n"
1411 "ld1w z10.s, p0/z, [%[outptr2]]\n"
1412 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1413 "fmul z10.s, z10.s, z3.s\n"
1414 "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
1415 "fmla z10.s, p0/m, z6.s, z2.s\n"
1416 "st1w z10.s, p0, [%[outptr2]]\n"
1417 "ld1w z11.s, p0/z, [%[outptr3]]\n"
1418 "fmul z11.s, z11.s, z3.s\n"
1419 "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
1420 "fmla z11.s, p0/m, z7.s, z2.s\n"
1421 "st1w z11.s, p0, [%[outptr3]]\n"
1422 "ld1w z8.s, p0/z, [%[outptr4]]\n"
1423 "fmul z8.s, z8.s, z3.s\n"
1424 "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
1425 "fmla z8.s, p0/m, z4.s, z2.s\n"
1426 "st1w z8.s, p0, [%[outptr4]]\n"
1427 "ld1w z9.s, p0/z, [%[outptr5]]\n"
1428 "fmul z9.s, z9.s, z3.s\n"
1429 "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
1430 "fmla z9.s, p0/m, z5.s, z2.s\n"
1431 "st1w z9.s, p0, [%[outptr5]]\n"
1432 "ld1w z10.s, p0/z, [%[outptr6]]\n"
1433 "fmul z10.s, z10.s, z3.s\n"
1434 "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
1435 "fmla z10.s, p0/m, z6.s, z2.s\n"
1436 "st1w z10.s, p0, [%[outptr6]]\n"
1437 "ld1w z11.s, p0/z, [%[outptr7]]\n"
1438 "fmul z11.s, z11.s, z3.s\n"
1439 "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
1440 "fmla z11.s, p0/m, z7.s, z2.s\n"
1441 "st1w z11.s, p0, [%[outptr7]]\n"
1442 "whilelt p0.s, %[p], %[w]\n"
1443 "b.none 1f\n"
1444 "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
1445 "incw %[p], all, mul #1\n"
1446 "fmul z8.s, z8.s, z3.s\n"
1447 "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
1448 "fmla z8.s, p0/m, z4.s, z2.s\n"
1449 "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
1450 "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
1451 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1452 "fmul z9.s, z9.s, z3.s\n"
1453 "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
1454 "fmla z9.s, p0/m, z5.s, z2.s\n"
1455 "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
1456 "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
1457 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1458 "fmul z10.s, z10.s, z3.s\n"
1459 "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
1460 "fmla z10.s, p0/m, z6.s, z2.s\n"
1461 "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
1462 "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
1463 "fmul z11.s, z11.s, z3.s\n"
1464 "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
1465 "fmla z11.s, p0/m, z7.s, z2.s\n"
1466 "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
1467 "ld1w z8.s, p0/z, [%[outptr4], #1, MUL VL]\n"
1468 "fmul z8.s, z8.s, z3.s\n"
1469 "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
1470 "fmla z8.s, p0/m, z4.s, z2.s\n"
1471 "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
1472 "ld1w z9.s, p0/z, [%[outptr5], #1, MUL VL]\n"
1473 "fmul z9.s, z9.s, z3.s\n"
1474 "ld1w z5.s, p0/z, [x8]\n"
1475 "fmla z9.s, p0/m, z5.s, z2.s\n"
1476 "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
1477 "ld1w z10.s, p0/z, [%[outptr6], #1, MUL VL]\n"
1478 "fmul z10.s, z10.s, z3.s\n"
1479 "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
1480 "fmla z10.s, p0/m, z6.s, z2.s\n"
1481 "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
1482 "ld1w z11.s, p0/z, [%[outptr7], #1, MUL VL]\n"
1483 "fmul z11.s, z11.s, z3.s\n"
1484 "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
1485 "fmla z11.s, p0/m, z7.s, z2.s\n"
1486 "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
1487 "whilelt p0.s, %[p], %[w]\n"
1488 "b.none 1f\n"
1489 "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
1490 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1491 "fmul z8.s, z8.s, z3.s\n"
1492 "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
1493 "fmla z8.s, p0/m, z4.s, z2.s\n"
1494 "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
1495 "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
1496 "addvl %[outptr0], %[outptr0], #3\n"
1497 "fmul z9.s, z9.s, z3.s\n"
1498 "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
1499 "fmla z9.s, p0/m, z5.s, z2.s\n"
1500 "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
1501 "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
1502 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1503 "fmul z10.s, z10.s, z3.s\n"
1504 "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
1505 "fmla z10.s, p0/m, z6.s, z2.s\n"
1506 "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
1507 "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
1508 "addvl %[outptr1], %[outptr1], #3\n"
1509 "fmul z11.s, z11.s, z3.s\n"
1510 "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
1511 "fmla z11.s, p0/m, z7.s, z2.s\n"
1512 "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
1513 "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
1514 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1515 "fmul z8.s, z8.s, z3.s\n"
1516 "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
1517 "fmla z8.s, p0/m, z4.s, z2.s\n"
1518 "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
1519 "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
1520 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1521 "fmul z9.s, z9.s, z3.s\n"
1522 "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
1523 "fmla z9.s, p0/m, z5.s, z2.s\n"
1524 "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
1525 "ld1w z10.s, p0/z, [%[outptr6], #2, MUL VL]\n"
1526 "addvl %[outptr2], %[outptr2], #3\n"
1527 "fmul z10.s, z10.s, z3.s\n"
1528 "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
1529 "fmla z10.s, p0/m, z6.s, z2.s\n"
1530 "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
1531 "ld1w z11.s, p0/z, [%[outptr7], #2, MUL VL]\n"
1532 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1533 "fmul z11.s, z11.s, z3.s\n"
1534 "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
1535 "fmla z11.s, p0/m, z7.s, z2.s\n"
1536 "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
1537 "addvl %[outptr3], %[outptr3], #3\n"
1538 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1539 "addvl %[outptr4], %[outptr4], #3\n"
1540 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1541 "addvl %[outptr5], %[outptr5], #3\n"
1542 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1543 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1544 "addvl %[outptr6], %[outptr6], #3\n"
1545 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1546 "addvl %[outptr7], %[outptr7], #3\n"
1547 "1:\n"
1548 "addvl %[inptr], %[inptr], #24\n"
1549 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1550 [inptr] "+r" (inptr), [p] "+r" (p)
1551 : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
1552 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
1553 );
1554 }
1555 break;
Georgios Pinitascfa2bba2019-06-27 17:00:52 +01001556
1557
Georgios Pinitas421405b2018-10-26 19:05:32 +01001558 }
1559 }
1560 }
1561 }
1562}
1563
1564#endif // __ARM_FEATURE_SVE