/*
2 * Copyright (c) 2019 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __aarch64__
27
28template<>
29inline void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
30{
31 const float *inptr = in;
32
33 for (int y=y0; y<ymax; y+=8) {
34 float *outptr0 = out + (y * ldout) + x0;
35 float *outptr1 = outptr0 + ldout;
36 float *outptr2 = outptr1 + ldout;
37 float *outptr3 = outptr2 + ldout;
38 float *outptr4 = outptr3 + ldout;
39 float *outptr5 = outptr4 + ldout;
40 float *outptr6 = outptr5 + ldout;
41 float *outptr7 = outptr6 + ldout;
42
43 const int height = ymax - y;
44
45 for (int i=x0; i<xmax; i+=12) {
46 if (beta==0.0f)
47 {
48 switch(height) {
49 case 1:
50 {
51 if ((i+11) >= xmax)
52 {
53 for (int xi=0; xi<12; xi++)
54 {
55 if ((i+xi) < xmax)
56 {
57 *outptr0 = (alpha * inptr[xi]);
58 outptr0++;
59 }
60 }
61 inptr += 96;
62 } else {
63 /* Optimized routine to copy an entire block */
64 __asm __volatile (
65 "ldr q4, [%[inptr]]\n"
66 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
67 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
68 "str q8, [%[outptr0]]\n"
69 "ldr q5, [%[inptr], #0x10]\n"
70 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
71 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
72 "str q9, [%[outptr0], #0x10]\n"
73 "ldr q6, [%[inptr], #0x20]\n"
74 "add %[inptr], %[inptr], #0x180\n"
75 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
76 "str q10, [%[outptr0], #0x20]\n"
77 "add %[outptr0], %[outptr0], #0x30\n"
78 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
79 [inptr] "+r" (inptr)
80 : [alpha] "w" (alpha), [beta] "w" (beta)
81 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
82 );
83 }
84 }
85 break;
86
87 case 2:
88 {
89 if ((i+11) >= xmax)
90 {
91 for (int xi=0; xi<12; xi++)
92 {
93 if ((i+xi) < xmax)
94 {
95 *outptr0 = (alpha * inptr[xi]);
96 outptr0++;
97 *outptr1 = (alpha * inptr[xi + 12]);
98 outptr1++;
99 }
100 }
101 inptr += 96;
102 } else {
103 /* Optimized routine to copy an entire block */
104 __asm __volatile (
105 "ldr q4, [%[inptr]]\n"
106 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
107 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
108 "str q8, [%[outptr0]]\n"
109 "ldr q5, [%[inptr], #0x30]\n"
110 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
111 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
112 "str q9, [%[outptr1]]\n"
113 "ldr q6, [%[inptr], #0x10]\n"
114 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
115 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
116 "str q10, [%[outptr0], #0x10]\n"
117 "ldr q7, [%[inptr], #0x40]\n"
118 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
119 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
120 "str q11, [%[outptr1], #0x10]\n"
121 "ldr q4, [%[inptr], #0x20]\n"
122 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
123 "str q8, [%[outptr0], #0x20]\n"
124 "ldr q5, [%[inptr], #0x50]\n"
125 "add %[outptr0], %[outptr0], #0x30\n"
126 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
127 "str q9, [%[outptr1], #0x20]\n"
128 "add %[outptr1], %[outptr1], #0x30\n"
129 "add %[inptr], %[inptr], #0x180\n"
130 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
131 [inptr] "+r" (inptr)
132 : [alpha] "w" (alpha), [beta] "w" (beta)
133 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
134 );
135 }
136 }
137 break;
138
139 case 3:
140 {
141 if ((i+11) >= xmax)
142 {
143 for (int xi=0; xi<12; xi++)
144 {
145 if ((i+xi) < xmax)
146 {
147 *outptr0 = (alpha * inptr[xi]);
148 outptr0++;
149 *outptr1 = (alpha * inptr[xi + 12]);
150 outptr1++;
151 *outptr2 = (alpha * inptr[xi + 24]);
152 outptr2++;
153 }
154 }
155 inptr += 96;
156 } else {
157 /* Optimized routine to copy an entire block */
158 __asm __volatile (
159 "ldr q4, [%[inptr]]\n"
160 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
161 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
162 "str q8, [%[outptr0]]\n"
163 "ldr q5, [%[inptr], #0x30]\n"
164 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
165 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
166 "str q9, [%[outptr1]]\n"
167 "ldr q6, [%[inptr], #0x60]\n"
168 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
169 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
170 "str q10, [%[outptr2]]\n"
171 "ldr q7, [%[inptr], #0x10]\n"
172 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
173 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
174 "str q11, [%[outptr0], #0x10]\n"
175 "ldr q4, [%[inptr], #0x40]\n"
176 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
177 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
178 "str q8, [%[outptr1], #0x10]\n"
179 "ldr q5, [%[inptr], #0x70]\n"
180 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
181 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
182 "str q9, [%[outptr2], #0x10]\n"
183 "ldr q6, [%[inptr], #0x20]\n"
184 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
185 "str q10, [%[outptr0], #0x20]\n"
186 "ldr q7, [%[inptr], #0x50]\n"
187 "add %[outptr0], %[outptr0], #0x30\n"
188 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
189 "str q11, [%[outptr1], #0x20]\n"
190 "ldr q4, [%[inptr], #0x80]\n"
191 "add %[outptr1], %[outptr1], #0x30\n"
192 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
193 "str q8, [%[outptr2], #0x20]\n"
194 "add %[outptr2], %[outptr2], #0x30\n"
195 "add %[inptr], %[inptr], #0x180\n"
196 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
197 [inptr] "+r" (inptr)
198 : [alpha] "w" (alpha), [beta] "w" (beta)
199 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
200 );
201 }
202 }
203 break;
204
205 case 4:
206 {
207 if ((i+11) >= xmax)
208 {
209 for (int xi=0; xi<12; xi++)
210 {
211 if ((i+xi) < xmax)
212 {
213 *outptr0 = (alpha * inptr[xi]);
214 outptr0++;
215 *outptr1 = (alpha * inptr[xi + 12]);
216 outptr1++;
217 *outptr2 = (alpha * inptr[xi + 24]);
218 outptr2++;
219 *outptr3 = (alpha * inptr[xi + 36]);
220 outptr3++;
221 }
222 }
223 inptr += 96;
224 } else {
225 /* Optimized routine to copy an entire block */
226 __asm __volatile (
227 "ldr q4, [%[inptr]]\n"
228 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
229 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
230 "str q8, [%[outptr0]]\n"
231 "ldr q5, [%[inptr], #0x30]\n"
232 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
233 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
234 "str q9, [%[outptr1]]\n"
235 "ldr q6, [%[inptr], #0x60]\n"
236 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
237 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
238 "str q10, [%[outptr2]]\n"
239 "ldr q7, [%[inptr], #0x90]\n"
240 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
241 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
242 "str q11, [%[outptr3]]\n"
243 "ldr q4, [%[inptr], #0x10]\n"
244 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
245 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
246 "str q8, [%[outptr0], #0x10]\n"
247 "ldr q5, [%[inptr], #0x40]\n"
248 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
249 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
250 "str q9, [%[outptr1], #0x10]\n"
251 "ldr q6, [%[inptr], #0x70]\n"
252 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
253 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
254 "str q10, [%[outptr2], #0x10]\n"
255 "ldr q7, [%[inptr], #0xa0]\n"
256 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
257 "str q11, [%[outptr3], #0x10]\n"
258 "ldr q4, [%[inptr], #0x20]\n"
259 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
260 "str q8, [%[outptr0], #0x20]\n"
261 "ldr q5, [%[inptr], #0x50]\n"
262 "add %[outptr0], %[outptr0], #0x30\n"
263 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
264 "str q9, [%[outptr1], #0x20]\n"
265 "ldr q6, [%[inptr], #0x80]\n"
266 "add %[outptr1], %[outptr1], #0x30\n"
267 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
268 "str q10, [%[outptr2], #0x20]\n"
269 "ldr q7, [%[inptr], #0xb0]\n"
270 "add %[outptr2], %[outptr2], #0x30\n"
271 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
272 "str q11, [%[outptr3], #0x20]\n"
273 "add %[outptr3], %[outptr3], #0x30\n"
274 "add %[inptr], %[inptr], #0x180\n"
275 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
276 [inptr] "+r" (inptr)
277 : [alpha] "w" (alpha), [beta] "w" (beta)
278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
279 );
280 }
281 }
282 break;
283
284 case 5:
285 {
286 if ((i+11) >= xmax)
287 {
288 for (int xi=0; xi<12; xi++)
289 {
290 if ((i+xi) < xmax)
291 {
292 *outptr0 = (alpha * inptr[xi]);
293 outptr0++;
294 *outptr1 = (alpha * inptr[xi + 12]);
295 outptr1++;
296 *outptr2 = (alpha * inptr[xi + 24]);
297 outptr2++;
298 *outptr3 = (alpha * inptr[xi + 36]);
299 outptr3++;
300 *outptr4 = (alpha * inptr[xi + 48]);
301 outptr4++;
302 }
303 }
304 inptr += 96;
305 } else {
306 /* Optimized routine to copy an entire block */
307 __asm __volatile (
308 "ldr q4, [%[inptr]]\n"
309 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
310 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
311 "str q8, [%[outptr0]]\n"
312 "ldr q5, [%[inptr], #0x30]\n"
313 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
314 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
315 "str q9, [%[outptr1]]\n"
316 "ldr q6, [%[inptr], #0x60]\n"
317 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
318 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
319 "str q10, [%[outptr2]]\n"
320 "ldr q7, [%[inptr], #0x90]\n"
321 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
322 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
323 "str q11, [%[outptr3]]\n"
324 "ldr q4, [%[inptr], #0xc0]\n"
325 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
326 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
327 "str q8, [%[outptr4]]\n"
328 "ldr q5, [%[inptr], #0x10]\n"
329 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
330 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
331 "str q9, [%[outptr0], #0x10]\n"
332 "ldr q6, [%[inptr], #0x40]\n"
333 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
334 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
335 "str q10, [%[outptr1], #0x10]\n"
336 "ldr q7, [%[inptr], #0x70]\n"
337 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
338 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
339 "str q11, [%[outptr2], #0x10]\n"
340 "ldr q4, [%[inptr], #0xa0]\n"
341 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
342 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
343 "str q8, [%[outptr3], #0x10]\n"
344 "ldr q5, [%[inptr], #0xd0]\n"
345 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
346 "str q9, [%[outptr4], #0x10]\n"
347 "ldr q6, [%[inptr], #0x20]\n"
348 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
349 "str q10, [%[outptr0], #0x20]\n"
350 "ldr q7, [%[inptr], #0x50]\n"
351 "add %[outptr0], %[outptr0], #0x30\n"
352 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
353 "str q11, [%[outptr1], #0x20]\n"
354 "ldr q4, [%[inptr], #0x80]\n"
355 "add %[outptr1], %[outptr1], #0x30\n"
356 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
357 "str q8, [%[outptr2], #0x20]\n"
358 "ldr q5, [%[inptr], #0xb0]\n"
359 "add %[outptr2], %[outptr2], #0x30\n"
360 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
361 "str q9, [%[outptr3], #0x20]\n"
362 "ldr q6, [%[inptr], #0xe0]\n"
363 "add %[outptr3], %[outptr3], #0x30\n"
364 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
365 "str q10, [%[outptr4], #0x20]\n"
366 "add %[outptr4], %[outptr4], #0x30\n"
367 "add %[inptr], %[inptr], #0x180\n"
368 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
369 [inptr] "+r" (inptr)
370 : [alpha] "w" (alpha), [beta] "w" (beta)
371 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
372 );
373 }
374 }
375 break;
376
377 case 6:
378 {
379 if ((i+11) >= xmax)
380 {
381 for (int xi=0; xi<12; xi++)
382 {
383 if ((i+xi) < xmax)
384 {
385 *outptr0 = (alpha * inptr[xi]);
386 outptr0++;
387 *outptr1 = (alpha * inptr[xi + 12]);
388 outptr1++;
389 *outptr2 = (alpha * inptr[xi + 24]);
390 outptr2++;
391 *outptr3 = (alpha * inptr[xi + 36]);
392 outptr3++;
393 *outptr4 = (alpha * inptr[xi + 48]);
394 outptr4++;
395 *outptr5 = (alpha * inptr[xi + 60]);
396 outptr5++;
397 }
398 }
399 inptr += 96;
400 } else {
401 /* Optimized routine to copy an entire block */
402 __asm __volatile (
403 "ldr q4, [%[inptr]]\n"
404 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
405 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
406 "str q8, [%[outptr0]]\n"
407 "ldr q5, [%[inptr], #0x30]\n"
408 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
409 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
410 "str q9, [%[outptr1]]\n"
411 "ldr q6, [%[inptr], #0x60]\n"
412 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
413 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
414 "str q10, [%[outptr2]]\n"
415 "ldr q7, [%[inptr], #0x90]\n"
416 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
417 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
418 "str q11, [%[outptr3]]\n"
419 "ldr q4, [%[inptr], #0xc0]\n"
420 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
421 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
422 "str q8, [%[outptr4]]\n"
423 "ldr q5, [%[inptr], #0xf0]\n"
424 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
425 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
426 "str q9, [%[outptr5]]\n"
427 "ldr q6, [%[inptr], #0x10]\n"
428 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
429 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
430 "str q10, [%[outptr0], #0x10]\n"
431 "ldr q7, [%[inptr], #0x40]\n"
432 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
433 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
434 "str q11, [%[outptr1], #0x10]\n"
435 "ldr q4, [%[inptr], #0x70]\n"
436 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
437 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
438 "str q8, [%[outptr2], #0x10]\n"
439 "ldr q5, [%[inptr], #0xa0]\n"
440 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
441 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
442 "str q9, [%[outptr3], #0x10]\n"
443 "ldr q6, [%[inptr], #0xd0]\n"
444 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
445 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
446 "str q10, [%[outptr4], #0x10]\n"
447 "ldr q7, [%[inptr], #0x100]\n"
448 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
449 "str q11, [%[outptr5], #0x10]\n"
450 "ldr q4, [%[inptr], #0x20]\n"
451 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
452 "str q8, [%[outptr0], #0x20]\n"
453 "ldr q5, [%[inptr], #0x50]\n"
454 "add %[outptr0], %[outptr0], #0x30\n"
455 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
456 "str q9, [%[outptr1], #0x20]\n"
457 "ldr q6, [%[inptr], #0x80]\n"
458 "add %[outptr1], %[outptr1], #0x30\n"
459 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
460 "str q10, [%[outptr2], #0x20]\n"
461 "ldr q7, [%[inptr], #0xb0]\n"
462 "add %[outptr2], %[outptr2], #0x30\n"
463 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
464 "str q11, [%[outptr3], #0x20]\n"
465 "ldr q4, [%[inptr], #0xe0]\n"
466 "add %[outptr3], %[outptr3], #0x30\n"
467 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
468 "str q8, [%[outptr4], #0x20]\n"
469 "ldr q5, [%[inptr], #0x110]\n"
470 "add %[outptr4], %[outptr4], #0x30\n"
471 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
472 "str q9, [%[outptr5], #0x20]\n"
473 "add %[outptr5], %[outptr5], #0x30\n"
474 "add %[inptr], %[inptr], #0x180\n"
475 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
476 [inptr] "+r" (inptr)
477 : [alpha] "w" (alpha), [beta] "w" (beta)
478 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
479 );
480 }
481 }
482 break;
483
484 case 7:
485 {
486 if ((i+11) >= xmax)
487 {
488 for (int xi=0; xi<12; xi++)
489 {
490 if ((i+xi) < xmax)
491 {
492 *outptr0 = (alpha * inptr[xi]);
493 outptr0++;
494 *outptr1 = (alpha * inptr[xi + 12]);
495 outptr1++;
496 *outptr2 = (alpha * inptr[xi + 24]);
497 outptr2++;
498 *outptr3 = (alpha * inptr[xi + 36]);
499 outptr3++;
500 *outptr4 = (alpha * inptr[xi + 48]);
501 outptr4++;
502 *outptr5 = (alpha * inptr[xi + 60]);
503 outptr5++;
504 *outptr6 = (alpha * inptr[xi + 72]);
505 outptr6++;
506 }
507 }
508 inptr += 96;
509 } else {
510 /* Optimized routine to copy an entire block */
511 __asm __volatile (
512 "ldr q4, [%[inptr]]\n"
513 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
514 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
515 "str q8, [%[outptr0]]\n"
516 "ldr q5, [%[inptr], #0x30]\n"
517 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
518 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
519 "str q9, [%[outptr1]]\n"
520 "ldr q6, [%[inptr], #0x60]\n"
521 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
522 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
523 "str q10, [%[outptr2]]\n"
524 "ldr q7, [%[inptr], #0x90]\n"
525 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
526 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
527 "str q11, [%[outptr3]]\n"
528 "ldr q4, [%[inptr], #0xc0]\n"
529 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
530 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
531 "str q8, [%[outptr4]]\n"
532 "ldr q5, [%[inptr], #0xf0]\n"
533 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
534 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
535 "str q9, [%[outptr5]]\n"
536 "ldr q6, [%[inptr], #0x120]\n"
537 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
538 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
539 "str q10, [%[outptr6]]\n"
540 "ldr q7, [%[inptr], #0x10]\n"
541 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
542 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
543 "str q11, [%[outptr0], #0x10]\n"
544 "ldr q4, [%[inptr], #0x40]\n"
545 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
546 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
547 "str q8, [%[outptr1], #0x10]\n"
548 "ldr q5, [%[inptr], #0x70]\n"
549 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
550 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
551 "str q9, [%[outptr2], #0x10]\n"
552 "ldr q6, [%[inptr], #0xa0]\n"
553 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
554 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
555 "str q10, [%[outptr3], #0x10]\n"
556 "ldr q7, [%[inptr], #0xd0]\n"
557 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
558 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
559 "str q11, [%[outptr4], #0x10]\n"
560 "ldr q4, [%[inptr], #0x100]\n"
561 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
562 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
563 "str q8, [%[outptr5], #0x10]\n"
564 "ldr q5, [%[inptr], #0x130]\n"
565 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
566 "str q9, [%[outptr6], #0x10]\n"
567 "ldr q6, [%[inptr], #0x20]\n"
568 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
569 "str q10, [%[outptr0], #0x20]\n"
570 "ldr q7, [%[inptr], #0x50]\n"
571 "add %[outptr0], %[outptr0], #0x30\n"
572 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
573 "str q11, [%[outptr1], #0x20]\n"
574 "ldr q4, [%[inptr], #0x80]\n"
575 "add %[outptr1], %[outptr1], #0x30\n"
576 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
577 "str q8, [%[outptr2], #0x20]\n"
578 "ldr q5, [%[inptr], #0xb0]\n"
579 "add %[outptr2], %[outptr2], #0x30\n"
580 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
581 "str q9, [%[outptr3], #0x20]\n"
582 "ldr q6, [%[inptr], #0xe0]\n"
583 "add %[outptr3], %[outptr3], #0x30\n"
584 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
585 "str q10, [%[outptr4], #0x20]\n"
586 "ldr q7, [%[inptr], #0x110]\n"
587 "add %[outptr4], %[outptr4], #0x30\n"
588 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
589 "str q11, [%[outptr5], #0x20]\n"
590 "ldr q4, [%[inptr], #0x140]\n"
591 "add %[outptr5], %[outptr5], #0x30\n"
592 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
593 "str q8, [%[outptr6], #0x20]\n"
594 "add %[outptr6], %[outptr6], #0x30\n"
595 "add %[inptr], %[inptr], #0x180\n"
596 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
597 [inptr] "+r" (inptr)
598 : [alpha] "w" (alpha), [beta] "w" (beta)
599 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
600 );
601 }
602 }
603 break;
604
605 default:
606 case 8:
607 {
608 if ((i+11) >= xmax)
609 {
610 for (int xi=0; xi<12; xi++)
611 {
612 if ((i+xi) < xmax)
613 {
614 *outptr0 = (alpha * inptr[xi]);
615 outptr0++;
616 *outptr1 = (alpha * inptr[xi + 12]);
617 outptr1++;
618 *outptr2 = (alpha * inptr[xi + 24]);
619 outptr2++;
620 *outptr3 = (alpha * inptr[xi + 36]);
621 outptr3++;
622 *outptr4 = (alpha * inptr[xi + 48]);
623 outptr4++;
624 *outptr5 = (alpha * inptr[xi + 60]);
625 outptr5++;
626 *outptr6 = (alpha * inptr[xi + 72]);
627 outptr6++;
628 *outptr7 = (alpha * inptr[xi + 84]);
629 outptr7++;
630 }
631 }
632 inptr += 96;
633 } else {
634 /* Optimized routine to copy an entire block */
635 __asm __volatile (
636 "ldr q4, [%[inptr]]\n"
637 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
638 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
639 "str q8, [%[outptr0]]\n"
640 "ldr q5, [%[inptr], #0x30]\n"
641 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
642 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
643 "str q9, [%[outptr1]]\n"
644 "ldr q6, [%[inptr], #0x60]\n"
645 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
646 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
647 "str q10, [%[outptr2]]\n"
648 "ldr q7, [%[inptr], #0x90]\n"
649 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
650 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
651 "str q11, [%[outptr3]]\n"
652 "ldr q4, [%[inptr], #0xc0]\n"
653 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
654 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
655 "str q8, [%[outptr4]]\n"
656 "ldr q5, [%[inptr], #0xf0]\n"
657 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
658 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
659 "str q9, [%[outptr5]]\n"
660 "ldr q6, [%[inptr], #0x120]\n"
661 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
662 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
663 "str q10, [%[outptr6]]\n"
664 "ldr q7, [%[inptr], #0x150]\n"
665 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
666 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
667 "str q11, [%[outptr7]]\n"
668 "ldr q4, [%[inptr], #0x10]\n"
669 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
670 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
671 "str q8, [%[outptr0], #0x10]\n"
672 "ldr q5, [%[inptr], #0x40]\n"
673 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
674 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
675 "str q9, [%[outptr1], #0x10]\n"
676 "ldr q6, [%[inptr], #0x70]\n"
677 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
678 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
679 "str q10, [%[outptr2], #0x10]\n"
680 "ldr q7, [%[inptr], #0xa0]\n"
681 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
682 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
683 "str q11, [%[outptr3], #0x10]\n"
684 "ldr q4, [%[inptr], #0xd0]\n"
685 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
686 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
687 "str q8, [%[outptr4], #0x10]\n"
688 "ldr q5, [%[inptr], #0x100]\n"
689 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
690 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
691 "str q9, [%[outptr5], #0x10]\n"
692 "ldr q6, [%[inptr], #0x130]\n"
693 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
694 "str q10, [%[outptr6], #0x10]\n"
695 "ldr q7, [%[inptr], #0x160]\n"
696 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
697 "str q11, [%[outptr7], #0x10]\n"
698 "ldr q4, [%[inptr], #0x20]\n"
699 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
700 "str q8, [%[outptr0], #0x20]\n"
701 "ldr q5, [%[inptr], #0x50]\n"
702 "add %[outptr0], %[outptr0], #0x30\n"
703 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
704 "str q9, [%[outptr1], #0x20]\n"
705 "ldr q6, [%[inptr], #0x80]\n"
706 "add %[outptr1], %[outptr1], #0x30\n"
707 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
708 "str q10, [%[outptr2], #0x20]\n"
709 "ldr q7, [%[inptr], #0xb0]\n"
710 "add %[outptr2], %[outptr2], #0x30\n"
711 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
712 "str q11, [%[outptr3], #0x20]\n"
713 "ldr q4, [%[inptr], #0xe0]\n"
714 "add %[outptr3], %[outptr3], #0x30\n"
715 "fmul v8.4s, v4.4s, %[alpha].s[0]\n"
716 "str q8, [%[outptr4], #0x20]\n"
717 "ldr q5, [%[inptr], #0x110]\n"
718 "add %[outptr4], %[outptr4], #0x30\n"
719 "fmul v9.4s, v5.4s, %[alpha].s[0]\n"
720 "str q9, [%[outptr5], #0x20]\n"
721 "ldr q6, [%[inptr], #0x140]\n"
722 "add %[outptr5], %[outptr5], #0x30\n"
723 "fmul v10.4s, v6.4s, %[alpha].s[0]\n"
724 "str q10, [%[outptr6], #0x20]\n"
725 "ldr q7, [%[inptr], #0x170]\n"
726 "add %[outptr6], %[outptr6], #0x30\n"
727 "fmul v11.4s, v7.4s, %[alpha].s[0]\n"
728 "str q11, [%[outptr7], #0x20]\n"
729 "add %[outptr7], %[outptr7], #0x30\n"
730 "add %[inptr], %[inptr], #0x180\n"
731 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
732 [inptr] "+r" (inptr)
733 : [alpha] "w" (alpha), [beta] "w" (beta)
734 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
735 );
736 }
737 }
738 break;
739
740
741 }
742 }
743 else
744 {
745 switch(height) {
746 case 1:
747 {
748 if ((i+11) >= xmax)
749 {
750 for (int xi=0; xi<12; xi++)
751 {
752 if ((i+xi) < xmax)
753 {
754 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
755 outptr0++;
756 }
757 }
758 inptr += 96;
759 } else {
760 /* Optimized routine to copy an entire block */
761 __asm __volatile (
762 "ldr q8, [%[outptr0]]\n"
763 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
764 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
765 "ldr q4, [%[inptr]]\n"
766 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
767 "str q8, [%[outptr0]]\n"
768 "ldr q9, [%[outptr0], #0x10]\n"
769 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
770 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
771 "ldr q5, [%[inptr], #0x10]\n"
772 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
773 "str q9, [%[outptr0], #0x10]\n"
774 "ldr q10, [%[outptr0], #0x20]\n"
775 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
776 "ldr q6, [%[inptr], #0x20]\n"
777 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
778 "str q10, [%[outptr0], #0x20]\n"
779 "add %[outptr0], %[outptr0], #0x30\n"
780 "add %[inptr], %[inptr], #0x180\n"
781 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
782 [inptr] "+r" (inptr)
783 : [alpha] "w" (alpha), [beta] "w" (beta)
784 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
785 );
786 }
787 }
788 break;
789
790 case 2:
791 {
792 if ((i+11) >= xmax)
793 {
794 for (int xi=0; xi<12; xi++)
795 {
796 if ((i+xi) < xmax)
797 {
798 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
799 outptr0++;
800 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
801 outptr1++;
802 }
803 }
804 inptr += 96;
805 } else {
806 /* Optimized routine to copy an entire block */
807 __asm __volatile (
808 "ldr q8, [%[outptr0]]\n"
809 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
810 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
811 "ldr q4, [%[inptr]]\n"
812 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
813 "str q8, [%[outptr0]]\n"
814 "ldr q9, [%[outptr1]]\n"
815 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
816 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
817 "ldr q5, [%[inptr], #0x30]\n"
818 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
819 "str q9, [%[outptr1]]\n"
820 "ldr q10, [%[outptr0], #0x10]\n"
821 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
822 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
823 "ldr q6, [%[inptr], #0x10]\n"
824 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
825 "str q10, [%[outptr0], #0x10]\n"
826 "ldr q11, [%[outptr1], #0x10]\n"
827 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
828 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
829 "ldr q7, [%[inptr], #0x40]\n"
830 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
831 "str q11, [%[outptr1], #0x10]\n"
832 "ldr q8, [%[outptr0], #0x20]\n"
833 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
834 "ldr q4, [%[inptr], #0x20]\n"
835 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
836 "str q8, [%[outptr0], #0x20]\n"
837 "ldr q9, [%[outptr1], #0x20]\n"
838 "add %[outptr0], %[outptr0], #0x30\n"
839 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
840 "ldr q5, [%[inptr], #0x50]\n"
841 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
842 "str q9, [%[outptr1], #0x20]\n"
843 "add %[outptr1], %[outptr1], #0x30\n"
844 "add %[inptr], %[inptr], #0x180\n"
845 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
846 [inptr] "+r" (inptr)
847 : [alpha] "w" (alpha), [beta] "w" (beta)
848 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
849 );
850 }
851 }
852 break;
853
854 case 3:
855 {
856 if ((i+11) >= xmax)
857 {
858 for (int xi=0; xi<12; xi++)
859 {
860 if ((i+xi) < xmax)
861 {
862 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
863 outptr0++;
864 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
865 outptr1++;
866 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
867 outptr2++;
868 }
869 }
870 inptr += 96;
871 } else {
872 /* Optimized routine to copy an entire block */
873 __asm __volatile (
874 "ldr q8, [%[outptr0]]\n"
875 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
876 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
877 "ldr q4, [%[inptr]]\n"
878 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
879 "str q8, [%[outptr0]]\n"
880 "ldr q9, [%[outptr1]]\n"
881 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
882 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
883 "ldr q5, [%[inptr], #0x30]\n"
884 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
885 "str q9, [%[outptr1]]\n"
886 "ldr q10, [%[outptr2]]\n"
887 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
888 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
889 "ldr q6, [%[inptr], #0x60]\n"
890 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
891 "str q10, [%[outptr2]]\n"
892 "ldr q11, [%[outptr0], #0x10]\n"
893 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
894 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
895 "ldr q7, [%[inptr], #0x10]\n"
896 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
897 "str q11, [%[outptr0], #0x10]\n"
898 "ldr q8, [%[outptr1], #0x10]\n"
899 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
900 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
901 "ldr q4, [%[inptr], #0x40]\n"
902 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
903 "str q8, [%[outptr1], #0x10]\n"
904 "ldr q9, [%[outptr2], #0x10]\n"
905 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
906 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
907 "ldr q5, [%[inptr], #0x70]\n"
908 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
909 "str q9, [%[outptr2], #0x10]\n"
910 "ldr q10, [%[outptr0], #0x20]\n"
911 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
912 "ldr q6, [%[inptr], #0x20]\n"
913 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
914 "str q10, [%[outptr0], #0x20]\n"
915 "ldr q11, [%[outptr1], #0x20]\n"
916 "add %[outptr0], %[outptr0], #0x30\n"
917 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
918 "ldr q7, [%[inptr], #0x50]\n"
919 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
920 "str q11, [%[outptr1], #0x20]\n"
921 "ldr q8, [%[outptr2], #0x20]\n"
922 "add %[outptr1], %[outptr1], #0x30\n"
923 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
924 "ldr q4, [%[inptr], #0x80]\n"
925 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
926 "str q8, [%[outptr2], #0x20]\n"
927 "add %[outptr2], %[outptr2], #0x30\n"
928 "add %[inptr], %[inptr], #0x180\n"
929 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
930 [inptr] "+r" (inptr)
931 : [alpha] "w" (alpha), [beta] "w" (beta)
932 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
933 );
934 }
935 }
936 break;
937
938 case 4: /* Updates four output rows (outptr0..outptr3). NOTE(review): the switch expression is outside this excerpt; presumably the remaining row count (`height`) - confirm. */
939 {
940 if ((i+11) >= xmax) /* Columns i..i+11 do not all lie below xmax: partial tile, handled one element at a time. */
941 {
942 for (int xi=0; xi<12; xi++)
943 {
944 if ((i+xi) < xmax) /* Columns at or beyond xmax are neither read nor written. */
945 {
946 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); /* Row r of the input panel is read from inptr[xi + 12*r]. */
947 outptr0++;
948 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
949 outptr1++;
950 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
951 outptr2++;
952 *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
953 outptr3++;
954 }
955 }
956 inptr += 96; /* Step over 96 input floats (12 x 8, the block size in the template arguments) even though only 4 rows were read. */
957 } else {
958 /* Full 12-column tile: each 4-float vector is updated as out = out*beta + in*alpha (ldr out, fmul by beta, ldr in, fmla by alpha, str out). The interleaved prfm instructions prefetch input and output addresses ahead of the current tile. */
959 __asm __volatile (
960 "ldr q8, [%[outptr0]]\n"
961 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
962 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
963 "ldr q4, [%[inptr]]\n"
964 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
965 "str q8, [%[outptr0]]\n"
966 "ldr q9, [%[outptr1]]\n"
967 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
968 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
969 "ldr q5, [%[inptr], #0x30]\n" /* Input rows are 0x30 bytes (12 floats) apart. */
970 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
971 "str q9, [%[outptr1]]\n"
972 "ldr q10, [%[outptr2]]\n"
973 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
974 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
975 "ldr q6, [%[inptr], #0x60]\n"
976 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
977 "str q10, [%[outptr2]]\n"
978 "ldr q11, [%[outptr3]]\n"
979 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
980 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
981 "ldr q7, [%[inptr], #0x90]\n"
982 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
983 "str q11, [%[outptr3]]\n"
984 "ldr q8, [%[outptr0], #0x10]\n"
985 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
986 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
987 "ldr q4, [%[inptr], #0x10]\n"
988 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
989 "str q8, [%[outptr0], #0x10]\n"
990 "ldr q9, [%[outptr1], #0x10]\n"
991 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
992 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
993 "ldr q5, [%[inptr], #0x40]\n"
994 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
995 "str q9, [%[outptr1], #0x10]\n"
996 "ldr q10, [%[outptr2], #0x10]\n"
997 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
998 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
999 "ldr q6, [%[inptr], #0x70]\n"
1000 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1001 "str q10, [%[outptr2], #0x10]\n"
1002 "ldr q11, [%[outptr3], #0x10]\n"
1003 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1004 "ldr q7, [%[inptr], #0xa0]\n"
1005 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1006 "str q11, [%[outptr3], #0x10]\n"
1007 "ldr q8, [%[outptr0], #0x20]\n"
1008 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1009 "ldr q4, [%[inptr], #0x20]\n"
1010 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1011 "str q8, [%[outptr0], #0x20]\n"
1012 "ldr q9, [%[outptr1], #0x20]\n"
1013 "add %[outptr0], %[outptr0], #0x30\n" /* Row pointer moves on by 0x30 bytes = 12 floats once its last vector is stored. */
1014 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1015 "ldr q5, [%[inptr], #0x50]\n"
1016 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1017 "str q9, [%[outptr1], #0x20]\n"
1018 "ldr q10, [%[outptr2], #0x20]\n"
1019 "add %[outptr1], %[outptr1], #0x30\n"
1020 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1021 "ldr q6, [%[inptr], #0x80]\n"
1022 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1023 "str q10, [%[outptr2], #0x20]\n"
1024 "ldr q11, [%[outptr3], #0x20]\n"
1025 "add %[outptr2], %[outptr2], #0x30\n"
1026 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1027 "ldr q7, [%[inptr], #0xb0]\n"
1028 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1029 "str q11, [%[outptr3], #0x20]\n"
1030 "add %[outptr3], %[outptr3], #0x30\n"
1031 "add %[inptr], %[inptr], #0x180\n" /* 0x180 bytes = 96 floats, the same step as the scalar path. */
1032 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), /* All eight row pointers are declared read-write; only outptr0..outptr3 are referenced by this asm. */
1033 [inptr] "+r" (inptr)
1034 : [alpha] "w" (alpha), [beta] "w" (beta)
1035 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" /* The asm writes v4-v11 only; the clobber list names v0-v15. */
1036 );
1037 }
1038 }
1039 break;
1040
1041 case 5: /* Updates five output rows (outptr0..outptr4). NOTE(review): the switch expression is outside this excerpt; presumably the remaining row count (`height`) - confirm. */
1042 {
1043 if ((i+11) >= xmax) /* Columns i..i+11 do not all lie below xmax: partial tile, handled one element at a time. */
1044 {
1045 for (int xi=0; xi<12; xi++)
1046 {
1047 if ((i+xi) < xmax) /* Columns at or beyond xmax are neither read nor written. */
1048 {
1049 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); /* Row r of the input panel is read from inptr[xi + 12*r]. */
1050 outptr0++;
1051 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
1052 outptr1++;
1053 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
1054 outptr2++;
1055 *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
1056 outptr3++;
1057 *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
1058 outptr4++;
1059 }
1060 }
1061 inptr += 96; /* Step over 96 input floats (12 x 8, the block size in the template arguments) even though only 5 rows were read. */
1062 } else {
1063 /* Full 12-column tile: each 4-float vector is updated as out = out*beta + in*alpha (ldr out, fmul by beta, ldr in, fmla by alpha, str out). The interleaved prfm instructions prefetch input and output addresses ahead of the current tile. */
1064 __asm __volatile (
1065 "ldr q8, [%[outptr0]]\n"
1066 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1067 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1068 "ldr q4, [%[inptr]]\n"
1069 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1070 "str q8, [%[outptr0]]\n"
1071 "ldr q9, [%[outptr1]]\n"
1072 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1073 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1074 "ldr q5, [%[inptr], #0x30]\n" /* Input rows are 0x30 bytes (12 floats) apart. */
1075 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1076 "str q9, [%[outptr1]]\n"
1077 "ldr q10, [%[outptr2]]\n"
1078 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1079 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1080 "ldr q6, [%[inptr], #0x60]\n"
1081 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1082 "str q10, [%[outptr2]]\n"
1083 "ldr q11, [%[outptr3]]\n"
1084 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1085 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1086 "ldr q7, [%[inptr], #0x90]\n"
1087 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1088 "str q11, [%[outptr3]]\n"
1089 "ldr q8, [%[outptr4]]\n"
1090 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1091 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1092 "ldr q4, [%[inptr], #0xc0]\n"
1093 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1094 "str q8, [%[outptr4]]\n"
1095 "ldr q9, [%[outptr0], #0x10]\n"
1096 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1097 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1098 "ldr q5, [%[inptr], #0x10]\n"
1099 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1100 "str q9, [%[outptr0], #0x10]\n"
1101 "ldr q10, [%[outptr1], #0x10]\n"
1102 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1103 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1104 "ldr q6, [%[inptr], #0x40]\n"
1105 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1106 "str q10, [%[outptr1], #0x10]\n"
1107 "ldr q11, [%[outptr2], #0x10]\n"
1108 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1109 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1110 "ldr q7, [%[inptr], #0x70]\n"
1111 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1112 "str q11, [%[outptr2], #0x10]\n"
1113 "ldr q8, [%[outptr3], #0x10]\n"
1114 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1115 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1116 "ldr q4, [%[inptr], #0xa0]\n"
1117 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1118 "str q8, [%[outptr3], #0x10]\n"
1119 "ldr q9, [%[outptr4], #0x10]\n"
1120 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1121 "ldr q5, [%[inptr], #0xd0]\n"
1122 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1123 "str q9, [%[outptr4], #0x10]\n"
1124 "ldr q10, [%[outptr0], #0x20]\n"
1125 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1126 "ldr q6, [%[inptr], #0x20]\n"
1127 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1128 "str q10, [%[outptr0], #0x20]\n"
1129 "ldr q11, [%[outptr1], #0x20]\n"
1130 "add %[outptr0], %[outptr0], #0x30\n" /* Row pointer moves on by 0x30 bytes = 12 floats once its last vector is stored. */
1131 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1132 "ldr q7, [%[inptr], #0x50]\n"
1133 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1134 "str q11, [%[outptr1], #0x20]\n"
1135 "ldr q8, [%[outptr2], #0x20]\n"
1136 "add %[outptr1], %[outptr1], #0x30\n"
1137 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1138 "ldr q4, [%[inptr], #0x80]\n"
1139 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1140 "str q8, [%[outptr2], #0x20]\n"
1141 "ldr q9, [%[outptr3], #0x20]\n"
1142 "add %[outptr2], %[outptr2], #0x30\n"
1143 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1144 "ldr q5, [%[inptr], #0xb0]\n"
1145 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1146 "str q9, [%[outptr3], #0x20]\n"
1147 "ldr q10, [%[outptr4], #0x20]\n"
1148 "add %[outptr3], %[outptr3], #0x30\n"
1149 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1150 "ldr q6, [%[inptr], #0xe0]\n"
1151 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1152 "str q10, [%[outptr4], #0x20]\n"
1153 "add %[outptr4], %[outptr4], #0x30\n"
1154 "add %[inptr], %[inptr], #0x180\n" /* 0x180 bytes = 96 floats, the same step as the scalar path. */
1155 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), /* All eight row pointers are declared read-write; only outptr0..outptr4 are referenced by this asm. */
1156 [inptr] "+r" (inptr)
1157 : [alpha] "w" (alpha), [beta] "w" (beta)
1158 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" /* The asm writes v4-v11 only; the clobber list names v0-v15. */
1159 );
1160 }
1161 }
1162 break;
1163
1164 case 6: /* Updates six output rows (outptr0..outptr5). NOTE(review): the switch expression is outside this excerpt; presumably the remaining row count (`height`) - confirm. */
1165 {
1166 if ((i+11) >= xmax) /* Columns i..i+11 do not all lie below xmax: partial tile, handled one element at a time. */
1167 {
1168 for (int xi=0; xi<12; xi++)
1169 {
1170 if ((i+xi) < xmax) /* Columns at or beyond xmax are neither read nor written. */
1171 {
1172 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); /* Row r of the input panel is read from inptr[xi + 12*r]. */
1173 outptr0++;
1174 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
1175 outptr1++;
1176 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
1177 outptr2++;
1178 *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
1179 outptr3++;
1180 *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
1181 outptr4++;
1182 *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
1183 outptr5++;
1184 }
1185 }
1186 inptr += 96; /* Step over 96 input floats (12 x 8, the block size in the template arguments) even though only 6 rows were read. */
1187 } else {
1188 /* Full 12-column tile: each 4-float vector is updated as out = out*beta + in*alpha (ldr out, fmul by beta, ldr in, fmla by alpha, str out). The interleaved prfm instructions prefetch input and output addresses ahead of the current tile. */
1189 __asm __volatile (
1190 "ldr q8, [%[outptr0]]\n"
1191 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1192 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1193 "ldr q4, [%[inptr]]\n"
1194 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1195 "str q8, [%[outptr0]]\n"
1196 "ldr q9, [%[outptr1]]\n"
1197 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1198 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1199 "ldr q5, [%[inptr], #0x30]\n" /* Input rows are 0x30 bytes (12 floats) apart. */
1200 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1201 "str q9, [%[outptr1]]\n"
1202 "ldr q10, [%[outptr2]]\n"
1203 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1204 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1205 "ldr q6, [%[inptr], #0x60]\n"
1206 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1207 "str q10, [%[outptr2]]\n"
1208 "ldr q11, [%[outptr3]]\n"
1209 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1210 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1211 "ldr q7, [%[inptr], #0x90]\n"
1212 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1213 "str q11, [%[outptr3]]\n"
1214 "ldr q8, [%[outptr4]]\n"
1215 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1216 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1217 "ldr q4, [%[inptr], #0xc0]\n"
1218 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1219 "str q8, [%[outptr4]]\n"
1220 "ldr q9, [%[outptr5]]\n"
1221 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1222 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1223 "ldr q5, [%[inptr], #0xf0]\n"
1224 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1225 "str q9, [%[outptr5]]\n"
1226 "ldr q10, [%[outptr0], #0x10]\n"
1227 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1228 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1229 "ldr q6, [%[inptr], #0x10]\n"
1230 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1231 "str q10, [%[outptr0], #0x10]\n"
1232 "ldr q11, [%[outptr1], #0x10]\n"
1233 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1234 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1235 "ldr q7, [%[inptr], #0x40]\n"
1236 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1237 "str q11, [%[outptr1], #0x10]\n"
1238 "ldr q8, [%[outptr2], #0x10]\n"
1239 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1240 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1241 "ldr q4, [%[inptr], #0x70]\n"
1242 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1243 "str q8, [%[outptr2], #0x10]\n"
1244 "ldr q9, [%[outptr3], #0x10]\n"
1245 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1246 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1247 "ldr q5, [%[inptr], #0xa0]\n"
1248 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1249 "str q9, [%[outptr3], #0x10]\n"
1250 "ldr q10, [%[outptr4], #0x10]\n"
1251 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1252 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1253 "ldr q6, [%[inptr], #0xd0]\n"
1254 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1255 "str q10, [%[outptr4], #0x10]\n"
1256 "ldr q11, [%[outptr5], #0x10]\n"
1257 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1258 "ldr q7, [%[inptr], #0x100]\n"
1259 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1260 "str q11, [%[outptr5], #0x10]\n"
1261 "ldr q8, [%[outptr0], #0x20]\n"
1262 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1263 "ldr q4, [%[inptr], #0x20]\n"
1264 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1265 "str q8, [%[outptr0], #0x20]\n"
1266 "ldr q9, [%[outptr1], #0x20]\n"
1267 "add %[outptr0], %[outptr0], #0x30\n" /* Row pointer moves on by 0x30 bytes = 12 floats once its last vector is stored. */
1268 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1269 "ldr q5, [%[inptr], #0x50]\n"
1270 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1271 "str q9, [%[outptr1], #0x20]\n"
1272 "ldr q10, [%[outptr2], #0x20]\n"
1273 "add %[outptr1], %[outptr1], #0x30\n"
1274 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1275 "ldr q6, [%[inptr], #0x80]\n"
1276 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1277 "str q10, [%[outptr2], #0x20]\n"
1278 "ldr q11, [%[outptr3], #0x20]\n"
1279 "add %[outptr2], %[outptr2], #0x30\n"
1280 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1281 "ldr q7, [%[inptr], #0xb0]\n"
1282 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1283 "str q11, [%[outptr3], #0x20]\n"
1284 "ldr q8, [%[outptr4], #0x20]\n"
1285 "add %[outptr3], %[outptr3], #0x30\n"
1286 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1287 "ldr q4, [%[inptr], #0xe0]\n"
1288 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1289 "str q8, [%[outptr4], #0x20]\n"
1290 "ldr q9, [%[outptr5], #0x20]\n"
1291 "add %[outptr4], %[outptr4], #0x30\n"
1292 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1293 "ldr q5, [%[inptr], #0x110]\n"
1294 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1295 "str q9, [%[outptr5], #0x20]\n"
1296 "add %[outptr5], %[outptr5], #0x30\n"
1297 "add %[inptr], %[inptr], #0x180\n" /* 0x180 bytes = 96 floats, the same step as the scalar path. */
1298 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), /* All eight row pointers are declared read-write; only outptr0..outptr5 are referenced by this asm. */
1299 [inptr] "+r" (inptr)
1300 : [alpha] "w" (alpha), [beta] "w" (beta)
1301 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" /* The asm writes v4-v11 only; the clobber list names v0-v15. */
1302 );
1303 }
1304 }
1305 break;
1306
1307 case 7: /* Updates seven output rows (outptr0..outptr6). NOTE(review): the switch expression is outside this excerpt; presumably the remaining row count (`height`) - confirm. */
1308 {
1309 if ((i+11) >= xmax) /* Columns i..i+11 do not all lie below xmax: partial tile, handled one element at a time. */
1310 {
1311 for (int xi=0; xi<12; xi++)
1312 {
1313 if ((i+xi) < xmax) /* Columns at or beyond xmax are neither read nor written. */
1314 {
1315 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); /* Row r of the input panel is read from inptr[xi + 12*r]. */
1316 outptr0++;
1317 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
1318 outptr1++;
1319 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
1320 outptr2++;
1321 *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
1322 outptr3++;
1323 *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
1324 outptr4++;
1325 *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
1326 outptr5++;
1327 *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
1328 outptr6++;
1329 }
1330 }
1331 inptr += 96; /* Step over 96 input floats (12 x 8, the block size in the template arguments) even though only 7 rows were read. */
1332 } else {
1333 /* Full 12-column tile: each 4-float vector is updated as out = out*beta + in*alpha (ldr out, fmul by beta, ldr in, fmla by alpha, str out). The interleaved prfm instructions prefetch input and output addresses ahead of the current tile. */
1334 __asm __volatile (
1335 "ldr q8, [%[outptr0]]\n"
1336 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1337 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1338 "ldr q4, [%[inptr]]\n"
1339 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1340 "str q8, [%[outptr0]]\n"
1341 "ldr q9, [%[outptr1]]\n"
1342 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1343 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1344 "ldr q5, [%[inptr], #0x30]\n" /* Input rows are 0x30 bytes (12 floats) apart. */
1345 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1346 "str q9, [%[outptr1]]\n"
1347 "ldr q10, [%[outptr2]]\n"
1348 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1349 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1350 "ldr q6, [%[inptr], #0x60]\n"
1351 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1352 "str q10, [%[outptr2]]\n"
1353 "ldr q11, [%[outptr3]]\n"
1354 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1355 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1356 "ldr q7, [%[inptr], #0x90]\n"
1357 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1358 "str q11, [%[outptr3]]\n"
1359 "ldr q8, [%[outptr4]]\n"
1360 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1361 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1362 "ldr q4, [%[inptr], #0xc0]\n"
1363 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1364 "str q8, [%[outptr4]]\n"
1365 "ldr q9, [%[outptr5]]\n"
1366 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1367 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1368 "ldr q5, [%[inptr], #0xf0]\n"
1369 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1370 "str q9, [%[outptr5]]\n"
1371 "ldr q10, [%[outptr6]]\n"
1372 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1373 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1374 "ldr q6, [%[inptr], #0x120]\n"
1375 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1376 "str q10, [%[outptr6]]\n"
1377 "ldr q11, [%[outptr0], #0x10]\n"
1378 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1379 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1380 "ldr q7, [%[inptr], #0x10]\n"
1381 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1382 "str q11, [%[outptr0], #0x10]\n"
1383 "ldr q8, [%[outptr1], #0x10]\n"
1384 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1385 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1386 "ldr q4, [%[inptr], #0x40]\n"
1387 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1388 "str q8, [%[outptr1], #0x10]\n"
1389 "ldr q9, [%[outptr2], #0x10]\n"
1390 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1391 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1392 "ldr q5, [%[inptr], #0x70]\n"
1393 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1394 "str q9, [%[outptr2], #0x10]\n"
1395 "ldr q10, [%[outptr3], #0x10]\n"
1396 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1397 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1398 "ldr q6, [%[inptr], #0xa0]\n"
1399 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1400 "str q10, [%[outptr3], #0x10]\n"
1401 "ldr q11, [%[outptr4], #0x10]\n"
1402 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1403 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1404 "ldr q7, [%[inptr], #0xd0]\n"
1405 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1406 "str q11, [%[outptr4], #0x10]\n"
1407 "ldr q8, [%[outptr5], #0x10]\n"
1408 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1409 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1410 "ldr q4, [%[inptr], #0x100]\n"
1411 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1412 "str q8, [%[outptr5], #0x10]\n"
1413 "ldr q9, [%[outptr6], #0x10]\n"
1414 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1415 "ldr q5, [%[inptr], #0x130]\n"
1416 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1417 "str q9, [%[outptr6], #0x10]\n"
1418 "ldr q10, [%[outptr0], #0x20]\n"
1419 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1420 "ldr q6, [%[inptr], #0x20]\n"
1421 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1422 "str q10, [%[outptr0], #0x20]\n"
1423 "ldr q11, [%[outptr1], #0x20]\n"
1424 "add %[outptr0], %[outptr0], #0x30\n" /* Row pointer moves on by 0x30 bytes = 12 floats once its last vector is stored. */
1425 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1426 "ldr q7, [%[inptr], #0x50]\n"
1427 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1428 "str q11, [%[outptr1], #0x20]\n"
1429 "ldr q8, [%[outptr2], #0x20]\n"
1430 "add %[outptr1], %[outptr1], #0x30\n"
1431 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1432 "ldr q4, [%[inptr], #0x80]\n"
1433 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1434 "str q8, [%[outptr2], #0x20]\n"
1435 "ldr q9, [%[outptr3], #0x20]\n"
1436 "add %[outptr2], %[outptr2], #0x30\n"
1437 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1438 "ldr q5, [%[inptr], #0xb0]\n"
1439 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1440 "str q9, [%[outptr3], #0x20]\n"
1441 "ldr q10, [%[outptr4], #0x20]\n"
1442 "add %[outptr3], %[outptr3], #0x30\n"
1443 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1444 "ldr q6, [%[inptr], #0xe0]\n"
1445 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1446 "str q10, [%[outptr4], #0x20]\n"
1447 "ldr q11, [%[outptr5], #0x20]\n"
1448 "add %[outptr4], %[outptr4], #0x30\n"
1449 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1450 "ldr q7, [%[inptr], #0x110]\n"
1451 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1452 "str q11, [%[outptr5], #0x20]\n"
1453 "ldr q8, [%[outptr6], #0x20]\n"
1454 "add %[outptr5], %[outptr5], #0x30\n"
1455 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1456 "ldr q4, [%[inptr], #0x140]\n"
1457 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1458 "str q8, [%[outptr6], #0x20]\n"
1459 "add %[outptr6], %[outptr6], #0x30\n"
1460 "add %[inptr], %[inptr], #0x180\n" /* 0x180 bytes = 96 floats, the same step as the scalar path. */
1461 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), /* All eight row pointers are declared read-write; only outptr0..outptr6 are referenced by this asm. */
1462 [inptr] "+r" (inptr)
1463 : [alpha] "w" (alpha), [beta] "w" (beta)
1464 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" /* The asm writes v4-v11 only; the clobber list names v0-v15. */
1465 );
1466 }
1467 }
1468 break;
1469
1470 default: /* No body of its own: any value not matched by another case label runs the 8-row path below. */
1471 case 8: /* Updates all eight output rows (outptr0..outptr7). NOTE(review): the switch expression is outside this excerpt; presumably the remaining row count (`height`) - confirm. */
1472 {
1473 if ((i+11) >= xmax) /* Columns i..i+11 do not all lie below xmax: partial tile, handled one element at a time. */
1474 {
1475 for (int xi=0; xi<12; xi++)
1476 {
1477 if ((i+xi) < xmax) /* Columns at or beyond xmax are neither read nor written. */
1478 {
1479 *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta); /* Row r of the input panel is read from inptr[xi + 12*r]. */
1480 outptr0++;
1481 *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
1482 outptr1++;
1483 *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
1484 outptr2++;
1485 *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
1486 outptr3++;
1487 *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
1488 outptr4++;
1489 *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
1490 outptr5++;
1491 *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
1492 outptr6++;
1493 *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
1494 outptr7++;
1495 }
1496 }
1497 inptr += 96; /* Step over the 96 input floats (12 x 8, the block size in the template arguments) of this panel. */
1498 } else {
1499 /* Full 12-column tile: each 4-float vector is updated as out = out*beta + in*alpha (ldr out, fmul by beta, ldr in, fmla by alpha, str out). The interleaved prfm instructions prefetch input and output addresses ahead of the current tile. */
1500 __asm __volatile (
1501 "ldr q8, [%[outptr0]]\n"
1502 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1503 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1504 "ldr q4, [%[inptr]]\n"
1505 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1506 "str q8, [%[outptr0]]\n"
1507 "ldr q9, [%[outptr1]]\n"
1508 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1509 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1510 "ldr q5, [%[inptr], #0x30]\n" /* Input rows are 0x30 bytes (12 floats) apart. */
1511 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1512 "str q9, [%[outptr1]]\n"
1513 "ldr q10, [%[outptr2]]\n"
1514 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1515 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1516 "ldr q6, [%[inptr], #0x60]\n"
1517 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1518 "str q10, [%[outptr2]]\n"
1519 "ldr q11, [%[outptr3]]\n"
1520 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1521 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1522 "ldr q7, [%[inptr], #0x90]\n"
1523 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1524 "str q11, [%[outptr3]]\n"
1525 "ldr q8, [%[outptr4]]\n"
1526 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
1527 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1528 "ldr q4, [%[inptr], #0xc0]\n"
1529 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1530 "str q8, [%[outptr4]]\n"
1531 "ldr q9, [%[outptr5]]\n"
1532 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
1533 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1534 "ldr q5, [%[inptr], #0xf0]\n"
1535 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1536 "str q9, [%[outptr5]]\n"
1537 "ldr q10, [%[outptr6]]\n"
1538 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1539 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1540 "ldr q6, [%[inptr], #0x120]\n"
1541 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1542 "str q10, [%[outptr6]]\n"
1543 "ldr q11, [%[outptr7]]\n"
1544 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1545 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1546 "ldr q7, [%[inptr], #0x150]\n"
1547 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1548 "str q11, [%[outptr7]]\n"
1549 "ldr q8, [%[outptr0], #0x10]\n"
1550 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1551 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1552 "ldr q4, [%[inptr], #0x10]\n"
1553 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1554 "str q8, [%[outptr0], #0x10]\n"
1555 "ldr q9, [%[outptr1], #0x10]\n"
1556 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1557 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1558 "ldr q5, [%[inptr], #0x40]\n"
1559 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1560 "str q9, [%[outptr1], #0x10]\n"
1561 "ldr q10, [%[outptr2], #0x10]\n"
1562 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1563 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1564 "ldr q6, [%[inptr], #0x70]\n"
1565 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1566 "str q10, [%[outptr2], #0x10]\n"
1567 "ldr q11, [%[outptr3], #0x10]\n"
1568 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1569 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1570 "ldr q7, [%[inptr], #0xa0]\n"
1571 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1572 "str q11, [%[outptr3], #0x10]\n"
1573 "ldr q8, [%[outptr4], #0x10]\n"
1574 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1575 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1576 "ldr q4, [%[inptr], #0xd0]\n"
1577 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1578 "str q8, [%[outptr4], #0x10]\n"
1579 "ldr q9, [%[outptr5], #0x10]\n"
1580 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1581 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1582 "ldr q5, [%[inptr], #0x100]\n"
1583 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1584 "str q9, [%[outptr5], #0x10]\n"
1585 "ldr q10, [%[outptr6], #0x10]\n"
1586 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1587 "ldr q6, [%[inptr], #0x130]\n"
1588 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1589 "str q10, [%[outptr6], #0x10]\n"
1590 "ldr q11, [%[outptr7], #0x10]\n"
1591 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1592 "ldr q7, [%[inptr], #0x160]\n"
1593 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1594 "str q11, [%[outptr7], #0x10]\n"
1595 "ldr q8, [%[outptr0], #0x20]\n"
1596 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1597 "ldr q4, [%[inptr], #0x20]\n"
1598 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1599 "str q8, [%[outptr0], #0x20]\n"
1600 "ldr q9, [%[outptr1], #0x20]\n"
1601 "add %[outptr0], %[outptr0], #0x30\n" /* Row pointer moves on by 0x30 bytes = 12 floats once its last vector is stored. */
1602 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1603 "ldr q5, [%[inptr], #0x50]\n"
1604 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1605 "str q9, [%[outptr1], #0x20]\n"
1606 "ldr q10, [%[outptr2], #0x20]\n"
1607 "add %[outptr1], %[outptr1], #0x30\n"
1608 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1609 "ldr q6, [%[inptr], #0x80]\n"
1610 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1611 "str q10, [%[outptr2], #0x20]\n"
1612 "ldr q11, [%[outptr3], #0x20]\n"
1613 "add %[outptr2], %[outptr2], #0x30\n"
1614 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1615 "ldr q7, [%[inptr], #0xb0]\n"
1616 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1617 "str q11, [%[outptr3], #0x20]\n"
1618 "ldr q8, [%[outptr4], #0x20]\n"
1619 "add %[outptr3], %[outptr3], #0x30\n"
1620 "fmul v8.4s, v8.4s, %[beta].s[0]\n"
1621 "ldr q4, [%[inptr], #0xe0]\n"
1622 "fmla v8.4s, v4.4s, %[alpha].s[0]\n"
1623 "str q8, [%[outptr4], #0x20]\n"
1624 "ldr q9, [%[outptr5], #0x20]\n"
1625 "add %[outptr4], %[outptr4], #0x30\n"
1626 "fmul v9.4s, v9.4s, %[beta].s[0]\n"
1627 "ldr q5, [%[inptr], #0x110]\n"
1628 "fmla v9.4s, v5.4s, %[alpha].s[0]\n"
1629 "str q9, [%[outptr5], #0x20]\n"
1630 "ldr q10, [%[outptr6], #0x20]\n"
1631 "add %[outptr5], %[outptr5], #0x30\n"
1632 "fmul v10.4s, v10.4s, %[beta].s[0]\n"
1633 "ldr q6, [%[inptr], #0x140]\n"
1634 "fmla v10.4s, v6.4s, %[alpha].s[0]\n"
1635 "str q10, [%[outptr6], #0x20]\n"
1636 "ldr q11, [%[outptr7], #0x20]\n"
1637 "add %[outptr6], %[outptr6], #0x30\n"
1638 "fmul v11.4s, v11.4s, %[beta].s[0]\n"
1639 "ldr q7, [%[inptr], #0x170]\n"
1640 "fmla v11.4s, v7.4s, %[alpha].s[0]\n"
1641 "str q11, [%[outptr7], #0x20]\n"
1642 "add %[outptr7], %[outptr7], #0x30\n"
1643 "add %[inptr], %[inptr], #0x180\n" /* 0x180 bytes = 96 floats, the same step as the scalar path. */
1644 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), /* All eight row pointers are read-write operands and all eight are advanced by this asm. */
1645 [inptr] "+r" (inptr)
1646 : [alpha] "w" (alpha), [beta] "w" (beta)
1647 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory" /* The asm writes v4-v11 only; the clobber list names v0-v15. */
1648 );
1649 }
1650 }
1651 break;
1652
1653
1654 }
1655 }
1656 }
1657 }
1658}
1659
1660#endif // __aarch64__