blob: fcf08e4e1528c8f33a00dd8d24cce4553781539b [file] [log] [blame]
/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#pragma once
25
26#ifdef __aarch64__
27
28template<>
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010029void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010030{
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010031 const int32_t *inptr = in;
Georgios Pinitasc7b183a2020-03-06 18:12:09 +000032 int32_t nullbias[12];
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010033
34
35 if (!append && !bias)
36 {
37 memset(nullbias, 0, (12 * sizeof(int32_t)));
38 }
39
40 for (int y=y0; y<ymax; y+=8)
41 {
42 int32_t *outptr0 = out + (y * ldout) + x0;
43 int32_t *outptr1 = outptr0 + ldout;
44 int32_t *outptr2 = outptr1 + ldout;
45 int32_t *outptr3 = outptr2 + ldout;
46 int32_t *outptr4 = outptr3 + ldout;
47 int32_t *outptr5 = outptr4 + ldout;
48 int32_t *outptr6 = outptr5 + ldout;
49 int32_t *outptr7 = outptr6 + ldout;
50
51 const int height = ymax - y;
52
53 for (int i=x0; i<xmax; i+=12)
54 {
55 if (append)
56 {
57 switch(height)
58 {
59 case 1:
60 {
61 if ((i+11) >= xmax)
62 {
63 for (int xi=0; xi<11; xi++)
64 {
65 if ((i+xi) < xmax)
66 {
67 *outptr0 += inptr[xi];
68 outptr0++;
69 }
70 }
71 inptr += 96;
72 } else {
73 /* Optimized routine to copy an entire block */
74 __asm __volatile (
75 "ldr q2, [%[outptr0]]\n"
76 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
77 "ldr q10, [%[inptr]]\n"
78 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
79 "ldr q3, [%[outptr0], #0x10]\n"
80 "ldr q11, [%[inptr], #0x10]\n"
81 "add v10.4s, v10.4s, v2.4s\n"
82 "ldr q4, [%[outptr0], #0x20]\n"
83 "ldr q12, [%[inptr], #0x20]\n"
84 "add %[inptr], %[inptr], #0x180\n"
85 "add v11.4s, v11.4s, v3.4s\n"
86 "str q10, [%[outptr0]]\n"
87 "add v12.4s, v12.4s, v4.4s\n"
88 "str q11, [%[outptr0], #0x10]\n"
89 "str q12, [%[outptr0], #0x20]\n"
90 "add %[outptr0], %[outptr0], #0x30\n"
91 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
92 [inptr] "+r" (inptr)
93 :
94 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
95 );
96 }
97 }
98 break;
99
100 case 2:
101 {
102 if ((i+11) >= xmax)
103 {
104 for (int xi=0; xi<11; xi++)
105 {
106 if ((i+xi) < xmax)
107 {
108 *outptr0 += inptr[xi];
109 outptr0++;
110 *outptr1 += inptr[xi + 12];
111 outptr1++;
112 }
113 }
114 inptr += 96;
115 } else {
116 /* Optimized routine to copy an entire block */
117 __asm __volatile (
118 "ldr q2, [%[outptr0]]\n"
119 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
120 "ldr q10, [%[inptr]]\n"
121 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
122 "ldr q3, [%[outptr0], #0x10]\n"
123 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
124 "add v10.4s, v10.4s, v2.4s\n"
125 "ldr q11, [%[inptr], #0x10]\n"
126 "ldr q4, [%[outptr0], #0x20]\n"
127 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
128 "ldr q12, [%[inptr], #0x20]\n"
129 "add v11.4s, v11.4s, v3.4s\n"
130 "str q10, [%[outptr0]]\n"
131 "ldr q5, [%[outptr1]]\n"
132 "ldr q13, [%[inptr], #0x30]\n"
133 "add v12.4s, v12.4s, v4.4s\n"
134 "str q11, [%[outptr0], #0x10]\n"
135 "ldr q6, [%[outptr1], #0x10]\n"
136 "ldr q14, [%[inptr], #0x40]\n"
137 "add v13.4s, v13.4s, v5.4s\n"
138 "str q12, [%[outptr0], #0x20]\n"
139 "ldr q7, [%[outptr1], #0x20]\n"
140 "add %[outptr0], %[outptr0], #0x30\n"
141 "add v14.4s, v14.4s, v6.4s\n"
142 "str q13, [%[outptr1]]\n"
143 "ldr q15, [%[inptr], #0x50]\n"
144 "add %[inptr], %[inptr], #0x180\n"
145 "str q14, [%[outptr1], #0x10]\n"
146 "add v15.4s, v15.4s, v7.4s\n"
147 "str q15, [%[outptr1], #0x20]\n"
148 "add %[outptr1], %[outptr1], #0x30\n"
149 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
150 [inptr] "+r" (inptr)
151 :
152 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
153 );
154 }
155 }
156 break;
157
158 case 3:
159 {
160 if ((i+11) >= xmax)
161 {
162 for (int xi=0; xi<11; xi++)
163 {
164 if ((i+xi) < xmax)
165 {
166 *outptr0 += inptr[xi];
167 outptr0++;
168 *outptr1 += inptr[xi + 12];
169 outptr1++;
170 *outptr2 += inptr[xi + 24];
171 outptr2++;
172 }
173 }
174 inptr += 96;
175 } else {
176 /* Optimized routine to copy an entire block */
177 __asm __volatile (
178 "ldr q2, [%[outptr0]]\n"
179 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
180 "ldr q10, [%[inptr]]\n"
181 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
182 "ldr q3, [%[outptr0], #0x10]\n"
183 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
184 "add v10.4s, v10.4s, v2.4s\n"
185 "ldr q11, [%[inptr], #0x10]\n"
186 "ldr q4, [%[outptr0], #0x20]\n"
187 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
188 "ldr q12, [%[inptr], #0x20]\n"
189 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
190 "add v11.4s, v11.4s, v3.4s\n"
191 "str q10, [%[outptr0]]\n"
192 "ldr q5, [%[outptr1]]\n"
193 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
194 "add v12.4s, v12.4s, v4.4s\n"
195 "str q11, [%[outptr0], #0x10]\n"
196 "ldr q13, [%[inptr], #0x30]\n"
197 "ldr q6, [%[outptr1], #0x10]\n"
198 "ldr q14, [%[inptr], #0x40]\n"
199 "str q12, [%[outptr0], #0x20]\n"
200 "add %[outptr0], %[outptr0], #0x30\n"
201 "add v13.4s, v13.4s, v5.4s\n"
202 "ldr q7, [%[outptr1], #0x20]\n"
203 "add v14.4s, v14.4s, v6.4s\n"
204 "ldr q15, [%[inptr], #0x50]\n"
205 "ldr q8, [%[outptr2]]\n"
206 "ldr q16, [%[inptr], #0x60]\n"
207 "str q13, [%[outptr1]]\n"
208 "add v15.4s, v15.4s, v7.4s\n"
209 "ldr q9, [%[outptr2], #0x10]\n"
210 "ldr q17, [%[inptr], #0x70]\n"
211 "add v16.4s, v16.4s, v8.4s\n"
212 "str q14, [%[outptr1], #0x10]\n"
213 "ldr q2, [%[outptr2], #0x20]\n"
214 "ldr q10, [%[inptr], #0x80]\n"
215 "add %[inptr], %[inptr], #0x180\n"
216 "add v17.4s, v17.4s, v9.4s\n"
217 "str q15, [%[outptr1], #0x20]\n"
218 "add %[outptr1], %[outptr1], #0x30\n"
219 "add v10.4s, v10.4s, v2.4s\n"
220 "str q16, [%[outptr2]]\n"
221 "str q17, [%[outptr2], #0x10]\n"
222 "str q10, [%[outptr2], #0x20]\n"
223 "add %[outptr2], %[outptr2], #0x30\n"
224 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
225 [inptr] "+r" (inptr)
226 :
227 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
228 );
229 }
230 }
231 break;
232
233 case 4:
234 {
235 if ((i+11) >= xmax)
236 {
237 for (int xi=0; xi<11; xi++)
238 {
239 if ((i+xi) < xmax)
240 {
241 *outptr0 += inptr[xi];
242 outptr0++;
243 *outptr1 += inptr[xi + 12];
244 outptr1++;
245 *outptr2 += inptr[xi + 24];
246 outptr2++;
247 *outptr3 += inptr[xi + 36];
248 outptr3++;
249 }
250 }
251 inptr += 96;
252 } else {
253 /* Optimized routine to copy an entire block */
254 __asm __volatile (
255 "ldr q2, [%[outptr0]]\n"
256 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
257 "ldr q10, [%[inptr]]\n"
258 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
259 "ldr q3, [%[outptr0], #0x10]\n"
260 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
261 "add v10.4s, v10.4s, v2.4s\n"
262 "ldr q11, [%[inptr], #0x10]\n"
263 "ldr q4, [%[outptr0], #0x20]\n"
264 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
265 "ldr q12, [%[inptr], #0x20]\n"
266 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
267 "add v11.4s, v11.4s, v3.4s\n"
268 "str q10, [%[outptr0]]\n"
269 "ldr q5, [%[outptr1]]\n"
270 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
271 "add v12.4s, v12.4s, v4.4s\n"
272 "str q11, [%[outptr0], #0x10]\n"
273 "ldr q13, [%[inptr], #0x30]\n"
274 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
275 "ldr q6, [%[outptr1], #0x10]\n"
276 "str q12, [%[outptr0], #0x20]\n"
277 "add %[outptr0], %[outptr0], #0x30\n"
278 "add v13.4s, v13.4s, v5.4s\n"
279 "ldr q14, [%[inptr], #0x40]\n"
280 "ldr q7, [%[outptr1], #0x20]\n"
281 "ldr q15, [%[inptr], #0x50]\n"
282 "ldr q8, [%[outptr2]]\n"
283 "add v14.4s, v14.4s, v6.4s\n"
284 "str q13, [%[outptr1]]\n"
285 "ldr q16, [%[inptr], #0x60]\n"
286 "add v15.4s, v15.4s, v7.4s\n"
287 "ldr q9, [%[outptr2], #0x10]\n"
288 "ldr q17, [%[inptr], #0x70]\n"
289 "str q14, [%[outptr1], #0x10]\n"
290 "add v16.4s, v16.4s, v8.4s\n"
291 "ldr q2, [%[outptr2], #0x20]\n"
292 "ldr q10, [%[inptr], #0x80]\n"
293 "add v17.4s, v17.4s, v9.4s\n"
294 "str q15, [%[outptr1], #0x20]\n"
295 "ldr q3, [%[outptr3]]\n"
296 "add %[outptr1], %[outptr1], #0x30\n"
297 "add v10.4s, v10.4s, v2.4s\n"
298 "str q16, [%[outptr2]]\n"
299 "ldr q11, [%[inptr], #0x90]\n"
300 "ldr q4, [%[outptr3], #0x10]\n"
301 "ldr q12, [%[inptr], #0xa0]\n"
302 "str q17, [%[outptr2], #0x10]\n"
303 "add v11.4s, v11.4s, v3.4s\n"
304 "ldr q5, [%[outptr3], #0x20]\n"
305 "ldr q13, [%[inptr], #0xb0]\n"
306 "add %[inptr], %[inptr], #0x180\n"
307 "add v12.4s, v12.4s, v4.4s\n"
308 "str q10, [%[outptr2], #0x20]\n"
309 "add %[outptr2], %[outptr2], #0x30\n"
310 "add v13.4s, v13.4s, v5.4s\n"
311 "str q11, [%[outptr3]]\n"
312 "str q12, [%[outptr3], #0x10]\n"
313 "str q13, [%[outptr3], #0x20]\n"
314 "add %[outptr3], %[outptr3], #0x30\n"
315 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
316 [inptr] "+r" (inptr)
317 :
318 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
319 );
320 }
321 }
322 break;
323
324 case 5:
325 {
326 if ((i+11) >= xmax)
327 {
328 for (int xi=0; xi<11; xi++)
329 {
330 if ((i+xi) < xmax)
331 {
332 *outptr0 += inptr[xi];
333 outptr0++;
334 *outptr1 += inptr[xi + 12];
335 outptr1++;
336 *outptr2 += inptr[xi + 24];
337 outptr2++;
338 *outptr3 += inptr[xi + 36];
339 outptr3++;
340 *outptr4 += inptr[xi + 48];
341 outptr4++;
342 }
343 }
344 inptr += 96;
345 } else {
346 /* Optimized routine to copy an entire block */
347 __asm __volatile (
348 "ldr q2, [%[outptr0]]\n"
349 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
350 "ldr q10, [%[inptr]]\n"
351 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
352 "ldr q3, [%[outptr0], #0x10]\n"
353 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
354 "add v10.4s, v10.4s, v2.4s\n"
355 "ldr q11, [%[inptr], #0x10]\n"
356 "ldr q4, [%[outptr0], #0x20]\n"
357 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
358 "ldr q12, [%[inptr], #0x20]\n"
359 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
360 "add v11.4s, v11.4s, v3.4s\n"
361 "str q10, [%[outptr0]]\n"
362 "ldr q5, [%[outptr1]]\n"
363 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
364 "add v12.4s, v12.4s, v4.4s\n"
365 "str q11, [%[outptr0], #0x10]\n"
366 "ldr q13, [%[inptr], #0x30]\n"
367 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
368 "ldr q6, [%[outptr1], #0x10]\n"
369 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
370 "add v13.4s, v13.4s, v5.4s\n"
371 "str q12, [%[outptr0], #0x20]\n"
372 "ldr q14, [%[inptr], #0x40]\n"
373 "add %[outptr0], %[outptr0], #0x30\n"
374 "ldr q7, [%[outptr1], #0x20]\n"
375 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
376 "add v14.4s, v14.4s, v6.4s\n"
377 "str q13, [%[outptr1]]\n"
378 "ldr q15, [%[inptr], #0x50]\n"
379 "ldr q8, [%[outptr2]]\n"
380 "ldr q16, [%[inptr], #0x60]\n"
381 "str q14, [%[outptr1], #0x10]\n"
382 "add v15.4s, v15.4s, v7.4s\n"
383 "ldr q9, [%[outptr2], #0x10]\n"
384 "ldr q17, [%[inptr], #0x70]\n"
385 "add v16.4s, v16.4s, v8.4s\n"
386 "ldr q2, [%[outptr2], #0x20]\n"
387 "ldr q10, [%[inptr], #0x80]\n"
388 "str q15, [%[outptr1], #0x20]\n"
389 "add %[outptr1], %[outptr1], #0x30\n"
390 "add v17.4s, v17.4s, v9.4s\n"
391 "ldr q3, [%[outptr3]]\n"
392 "add v10.4s, v10.4s, v2.4s\n"
393 "str q16, [%[outptr2]]\n"
394 "ldr q11, [%[inptr], #0x90]\n"
395 "ldr q4, [%[outptr3], #0x10]\n"
396 "ldr q12, [%[inptr], #0xa0]\n"
397 "str q17, [%[outptr2], #0x10]\n"
398 "add v11.4s, v11.4s, v3.4s\n"
399 "ldr q5, [%[outptr3], #0x20]\n"
400 "ldr q13, [%[inptr], #0xb0]\n"
401 "add v12.4s, v12.4s, v4.4s\n"
402 "str q10, [%[outptr2], #0x20]\n"
403 "ldr q6, [%[outptr4]]\n"
404 "add %[outptr2], %[outptr2], #0x30\n"
405 "add v13.4s, v13.4s, v5.4s\n"
406 "str q11, [%[outptr3]]\n"
407 "ldr q14, [%[inptr], #0xc0]\n"
408 "ldr q7, [%[outptr4], #0x10]\n"
409 "ldr q15, [%[inptr], #0xd0]\n"
410 "str q12, [%[outptr3], #0x10]\n"
411 "add v14.4s, v14.4s, v6.4s\n"
412 "ldr q8, [%[outptr4], #0x20]\n"
413 "ldr q16, [%[inptr], #0xe0]\n"
414 "add %[inptr], %[inptr], #0x180\n"
415 "add v15.4s, v15.4s, v7.4s\n"
416 "str q13, [%[outptr3], #0x20]\n"
417 "add %[outptr3], %[outptr3], #0x30\n"
418 "add v16.4s, v16.4s, v8.4s\n"
419 "str q14, [%[outptr4]]\n"
420 "str q15, [%[outptr4], #0x10]\n"
421 "str q16, [%[outptr4], #0x20]\n"
422 "add %[outptr4], %[outptr4], #0x30\n"
423 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
424 [inptr] "+r" (inptr)
425 :
426 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
427 );
428 }
429 }
430 break;
431
432 case 6:
433 {
434 if ((i+11) >= xmax)
435 {
436 for (int xi=0; xi<11; xi++)
437 {
438 if ((i+xi) < xmax)
439 {
440 *outptr0 += inptr[xi];
441 outptr0++;
442 *outptr1 += inptr[xi + 12];
443 outptr1++;
444 *outptr2 += inptr[xi + 24];
445 outptr2++;
446 *outptr3 += inptr[xi + 36];
447 outptr3++;
448 *outptr4 += inptr[xi + 48];
449 outptr4++;
450 *outptr5 += inptr[xi + 60];
451 outptr5++;
452 }
453 }
454 inptr += 96;
455 } else {
456 /* Optimized routine to copy an entire block */
457 __asm __volatile (
458 "ldr q2, [%[outptr0]]\n"
459 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
460 "ldr q10, [%[inptr]]\n"
461 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
462 "ldr q3, [%[outptr0], #0x10]\n"
463 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
464 "add v10.4s, v10.4s, v2.4s\n"
465 "ldr q11, [%[inptr], #0x10]\n"
466 "ldr q4, [%[outptr0], #0x20]\n"
467 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
468 "ldr q12, [%[inptr], #0x20]\n"
469 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
470 "add v11.4s, v11.4s, v3.4s\n"
471 "str q10, [%[outptr0]]\n"
472 "ldr q5, [%[outptr1]]\n"
473 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
474 "add v12.4s, v12.4s, v4.4s\n"
475 "str q11, [%[outptr0], #0x10]\n"
476 "ldr q13, [%[inptr], #0x30]\n"
477 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
478 "ldr q6, [%[outptr1], #0x10]\n"
479 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
480 "add v13.4s, v13.4s, v5.4s\n"
481 "str q12, [%[outptr0], #0x20]\n"
482 "ldr q14, [%[inptr], #0x40]\n"
483 "add %[outptr0], %[outptr0], #0x30\n"
484 "ldr q7, [%[outptr1], #0x20]\n"
485 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
486 "add v14.4s, v14.4s, v6.4s\n"
487 "str q13, [%[outptr1]]\n"
488 "ldr q15, [%[inptr], #0x50]\n"
489 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
490 "ldr q8, [%[outptr2]]\n"
491 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
492 "add v15.4s, v15.4s, v7.4s\n"
493 "str q14, [%[outptr1], #0x10]\n"
494 "ldr q16, [%[inptr], #0x60]\n"
495 "ldr q9, [%[outptr2], #0x10]\n"
496 "ldr q17, [%[inptr], #0x70]\n"
497 "str q15, [%[outptr1], #0x20]\n"
498 "add %[outptr1], %[outptr1], #0x30\n"
499 "add v16.4s, v16.4s, v8.4s\n"
500 "ldr q2, [%[outptr2], #0x20]\n"
501 "add v17.4s, v17.4s, v9.4s\n"
502 "ldr q10, [%[inptr], #0x80]\n"
503 "ldr q3, [%[outptr3]]\n"
504 "ldr q11, [%[inptr], #0x90]\n"
505 "str q16, [%[outptr2]]\n"
506 "add v10.4s, v10.4s, v2.4s\n"
507 "ldr q4, [%[outptr3], #0x10]\n"
508 "ldr q12, [%[inptr], #0xa0]\n"
509 "add v11.4s, v11.4s, v3.4s\n"
510 "str q17, [%[outptr2], #0x10]\n"
511 "ldr q5, [%[outptr3], #0x20]\n"
512 "ldr q13, [%[inptr], #0xb0]\n"
513 "add v12.4s, v12.4s, v4.4s\n"
514 "str q10, [%[outptr2], #0x20]\n"
515 "ldr q6, [%[outptr4]]\n"
516 "add %[outptr2], %[outptr2], #0x30\n"
517 "add v13.4s, v13.4s, v5.4s\n"
518 "str q11, [%[outptr3]]\n"
519 "ldr q14, [%[inptr], #0xc0]\n"
520 "ldr q7, [%[outptr4], #0x10]\n"
521 "ldr q15, [%[inptr], #0xd0]\n"
522 "str q12, [%[outptr3], #0x10]\n"
523 "add v14.4s, v14.4s, v6.4s\n"
524 "ldr q8, [%[outptr4], #0x20]\n"
525 "ldr q16, [%[inptr], #0xe0]\n"
526 "add v15.4s, v15.4s, v7.4s\n"
527 "str q13, [%[outptr3], #0x20]\n"
528 "ldr q9, [%[outptr5]]\n"
529 "add %[outptr3], %[outptr3], #0x30\n"
530 "add v16.4s, v16.4s, v8.4s\n"
531 "str q14, [%[outptr4]]\n"
532 "ldr q17, [%[inptr], #0xf0]\n"
533 "ldr q2, [%[outptr5], #0x10]\n"
534 "ldr q10, [%[inptr], #0x100]\n"
535 "str q15, [%[outptr4], #0x10]\n"
536 "add v17.4s, v17.4s, v9.4s\n"
537 "ldr q3, [%[outptr5], #0x20]\n"
538 "ldr q11, [%[inptr], #0x110]\n"
539 "add %[inptr], %[inptr], #0x180\n"
540 "add v10.4s, v10.4s, v2.4s\n"
541 "str q16, [%[outptr4], #0x20]\n"
542 "add %[outptr4], %[outptr4], #0x30\n"
543 "add v11.4s, v11.4s, v3.4s\n"
544 "str q17, [%[outptr5]]\n"
545 "str q10, [%[outptr5], #0x10]\n"
546 "str q11, [%[outptr5], #0x20]\n"
547 "add %[outptr5], %[outptr5], #0x30\n"
548 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
549 [inptr] "+r" (inptr)
550 :
551 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
552 );
553 }
554 }
555 break;
556
557 case 7:
558 {
559 if ((i+11) >= xmax)
560 {
561 for (int xi=0; xi<11; xi++)
562 {
563 if ((i+xi) < xmax)
564 {
565 *outptr0 += inptr[xi];
566 outptr0++;
567 *outptr1 += inptr[xi + 12];
568 outptr1++;
569 *outptr2 += inptr[xi + 24];
570 outptr2++;
571 *outptr3 += inptr[xi + 36];
572 outptr3++;
573 *outptr4 += inptr[xi + 48];
574 outptr4++;
575 *outptr5 += inptr[xi + 60];
576 outptr5++;
577 *outptr6 += inptr[xi + 72];
578 outptr6++;
579 }
580 }
581 inptr += 96;
582 } else {
583 /* Optimized routine to copy an entire block */
584 __asm __volatile (
585 "ldr q2, [%[outptr0]]\n"
586 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
587 "ldr q10, [%[inptr]]\n"
588 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
589 "ldr q3, [%[outptr0], #0x10]\n"
590 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
591 "add v10.4s, v10.4s, v2.4s\n"
592 "ldr q11, [%[inptr], #0x10]\n"
593 "ldr q4, [%[outptr0], #0x20]\n"
594 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
595 "ldr q12, [%[inptr], #0x20]\n"
596 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
597 "add v11.4s, v11.4s, v3.4s\n"
598 "str q10, [%[outptr0]]\n"
599 "ldr q5, [%[outptr1]]\n"
600 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
601 "add v12.4s, v12.4s, v4.4s\n"
602 "str q11, [%[outptr0], #0x10]\n"
603 "ldr q13, [%[inptr], #0x30]\n"
604 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
605 "ldr q6, [%[outptr1], #0x10]\n"
606 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
607 "add v13.4s, v13.4s, v5.4s\n"
608 "str q12, [%[outptr0], #0x20]\n"
609 "ldr q14, [%[inptr], #0x40]\n"
610 "add %[outptr0], %[outptr0], #0x30\n"
611 "ldr q7, [%[outptr1], #0x20]\n"
612 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
613 "add v14.4s, v14.4s, v6.4s\n"
614 "str q13, [%[outptr1]]\n"
615 "ldr q15, [%[inptr], #0x50]\n"
616 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
617 "ldr q8, [%[outptr2]]\n"
618 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
619 "add v15.4s, v15.4s, v7.4s\n"
620 "str q14, [%[outptr1], #0x10]\n"
621 "ldr q16, [%[inptr], #0x60]\n"
622 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
623 "ldr q9, [%[outptr2], #0x10]\n"
624 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
625 "add v16.4s, v16.4s, v8.4s\n"
626 "str q15, [%[outptr1], #0x20]\n"
627 "ldr q17, [%[inptr], #0x70]\n"
628 "add %[outptr1], %[outptr1], #0x30\n"
629 "ldr q2, [%[outptr2], #0x20]\n"
630 "str q16, [%[outptr2]]\n"
631 "add v17.4s, v17.4s, v9.4s\n"
632 "ldr q10, [%[inptr], #0x80]\n"
633 "ldr q3, [%[outptr3]]\n"
634 "ldr q11, [%[inptr], #0x90]\n"
635 "ldr q4, [%[outptr3], #0x10]\n"
636 "add v10.4s, v10.4s, v2.4s\n"
637 "str q17, [%[outptr2], #0x10]\n"
638 "ldr q12, [%[inptr], #0xa0]\n"
639 "add v11.4s, v11.4s, v3.4s\n"
640 "ldr q5, [%[outptr3], #0x20]\n"
641 "ldr q13, [%[inptr], #0xb0]\n"
642 "str q10, [%[outptr2], #0x20]\n"
643 "add %[outptr2], %[outptr2], #0x30\n"
644 "add v12.4s, v12.4s, v4.4s\n"
645 "ldr q6, [%[outptr4]]\n"
646 "add v13.4s, v13.4s, v5.4s\n"
647 "str q11, [%[outptr3]]\n"
648 "ldr q14, [%[inptr], #0xc0]\n"
649 "ldr q7, [%[outptr4], #0x10]\n"
650 "ldr q15, [%[inptr], #0xd0]\n"
651 "str q12, [%[outptr3], #0x10]\n"
652 "add v14.4s, v14.4s, v6.4s\n"
653 "ldr q8, [%[outptr4], #0x20]\n"
654 "ldr q16, [%[inptr], #0xe0]\n"
655 "add v15.4s, v15.4s, v7.4s\n"
656 "str q13, [%[outptr3], #0x20]\n"
657 "ldr q9, [%[outptr5]]\n"
658 "add %[outptr3], %[outptr3], #0x30\n"
659 "add v16.4s, v16.4s, v8.4s\n"
660 "str q14, [%[outptr4]]\n"
661 "ldr q17, [%[inptr], #0xf0]\n"
662 "ldr q2, [%[outptr5], #0x10]\n"
663 "ldr q10, [%[inptr], #0x100]\n"
664 "str q15, [%[outptr4], #0x10]\n"
665 "add v17.4s, v17.4s, v9.4s\n"
666 "ldr q3, [%[outptr5], #0x20]\n"
667 "ldr q11, [%[inptr], #0x110]\n"
668 "add v10.4s, v10.4s, v2.4s\n"
669 "str q16, [%[outptr4], #0x20]\n"
670 "ldr q4, [%[outptr6]]\n"
671 "add %[outptr4], %[outptr4], #0x30\n"
672 "add v11.4s, v11.4s, v3.4s\n"
673 "str q17, [%[outptr5]]\n"
674 "ldr q12, [%[inptr], #0x120]\n"
675 "ldr q5, [%[outptr6], #0x10]\n"
676 "ldr q13, [%[inptr], #0x130]\n"
677 "str q10, [%[outptr5], #0x10]\n"
678 "add v12.4s, v12.4s, v4.4s\n"
679 "ldr q6, [%[outptr6], #0x20]\n"
680 "ldr q14, [%[inptr], #0x140]\n"
681 "add %[inptr], %[inptr], #0x180\n"
682 "add v13.4s, v13.4s, v5.4s\n"
683 "str q11, [%[outptr5], #0x20]\n"
684 "add %[outptr5], %[outptr5], #0x30\n"
685 "add v14.4s, v14.4s, v6.4s\n"
686 "str q12, [%[outptr6]]\n"
687 "str q13, [%[outptr6], #0x10]\n"
688 "str q14, [%[outptr6], #0x20]\n"
689 "add %[outptr6], %[outptr6], #0x30\n"
690 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
691 [inptr] "+r" (inptr)
692 :
693 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
694 );
695 }
696 }
697 break;
698
699 default:
700 case 8:
701 {
702 if ((i+11) >= xmax)
703 {
704 for (int xi=0; xi<11; xi++)
705 {
706 if ((i+xi) < xmax)
707 {
708 *outptr0 += inptr[xi];
709 outptr0++;
710 *outptr1 += inptr[xi + 12];
711 outptr1++;
712 *outptr2 += inptr[xi + 24];
713 outptr2++;
714 *outptr3 += inptr[xi + 36];
715 outptr3++;
716 *outptr4 += inptr[xi + 48];
717 outptr4++;
718 *outptr5 += inptr[xi + 60];
719 outptr5++;
720 *outptr6 += inptr[xi + 72];
721 outptr6++;
722 *outptr7 += inptr[xi + 84];
723 outptr7++;
724 }
725 }
726 inptr += 96;
727 } else {
728 /* Optimized routine to copy an entire block */
729 __asm __volatile (
730 "ldr q2, [%[outptr0]]\n"
731 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
732 "ldr q10, [%[inptr]]\n"
733 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
734 "ldr q3, [%[outptr0], #0x10]\n"
735 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
736 "add v10.4s, v10.4s, v2.4s\n"
737 "ldr q11, [%[inptr], #0x10]\n"
738 "ldr q4, [%[outptr0], #0x20]\n"
739 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
740 "ldr q12, [%[inptr], #0x20]\n"
741 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
742 "add v11.4s, v11.4s, v3.4s\n"
743 "str q10, [%[outptr0]]\n"
744 "ldr q5, [%[outptr1]]\n"
745 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
746 "add v12.4s, v12.4s, v4.4s\n"
747 "str q11, [%[outptr0], #0x10]\n"
748 "ldr q13, [%[inptr], #0x30]\n"
749 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
750 "ldr q6, [%[outptr1], #0x10]\n"
751 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
752 "add v13.4s, v13.4s, v5.4s\n"
753 "str q12, [%[outptr0], #0x20]\n"
754 "ldr q14, [%[inptr], #0x40]\n"
755 "add %[outptr0], %[outptr0], #0x30\n"
756 "ldr q7, [%[outptr1], #0x20]\n"
757 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
758 "add v14.4s, v14.4s, v6.4s\n"
759 "str q13, [%[outptr1]]\n"
760 "ldr q15, [%[inptr], #0x50]\n"
761 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
762 "ldr q8, [%[outptr2]]\n"
763 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
764 "add v15.4s, v15.4s, v7.4s\n"
765 "str q14, [%[outptr1], #0x10]\n"
766 "ldr q16, [%[inptr], #0x60]\n"
767 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
768 "ldr q9, [%[outptr2], #0x10]\n"
769 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
770 "add v16.4s, v16.4s, v8.4s\n"
771 "str q15, [%[outptr1], #0x20]\n"
772 "ldr q17, [%[inptr], #0x70]\n"
773 "add %[outptr1], %[outptr1], #0x30\n"
774 "ldr q2, [%[outptr2], #0x20]\n"
775 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
776 "add v17.4s, v17.4s, v9.4s\n"
777 "str q16, [%[outptr2]]\n"
778 "ldr q10, [%[inptr], #0x80]\n"
779 "ldr q3, [%[outptr3]]\n"
780 "ldr q11, [%[inptr], #0x90]\n"
781 "str q17, [%[outptr2], #0x10]\n"
782 "add v10.4s, v10.4s, v2.4s\n"
783 "ldr q4, [%[outptr3], #0x10]\n"
784 "ldr q12, [%[inptr], #0xa0]\n"
785 "add v11.4s, v11.4s, v3.4s\n"
786 "ldr q5, [%[outptr3], #0x20]\n"
787 "ldr q13, [%[inptr], #0xb0]\n"
788 "str q10, [%[outptr2], #0x20]\n"
789 "add %[outptr2], %[outptr2], #0x30\n"
790 "add v12.4s, v12.4s, v4.4s\n"
791 "ldr q6, [%[outptr4]]\n"
792 "add v13.4s, v13.4s, v5.4s\n"
793 "str q11, [%[outptr3]]\n"
794 "ldr q14, [%[inptr], #0xc0]\n"
795 "ldr q7, [%[outptr4], #0x10]\n"
796 "ldr q15, [%[inptr], #0xd0]\n"
797 "str q12, [%[outptr3], #0x10]\n"
798 "add v14.4s, v14.4s, v6.4s\n"
799 "ldr q8, [%[outptr4], #0x20]\n"
800 "ldr q16, [%[inptr], #0xe0]\n"
801 "add v15.4s, v15.4s, v7.4s\n"
802 "str q13, [%[outptr3], #0x20]\n"
803 "ldr q9, [%[outptr5]]\n"
804 "add %[outptr3], %[outptr3], #0x30\n"
805 "add v16.4s, v16.4s, v8.4s\n"
806 "str q14, [%[outptr4]]\n"
807 "ldr q17, [%[inptr], #0xf0]\n"
808 "ldr q2, [%[outptr5], #0x10]\n"
809 "ldr q10, [%[inptr], #0x100]\n"
810 "str q15, [%[outptr4], #0x10]\n"
811 "add v17.4s, v17.4s, v9.4s\n"
812 "ldr q3, [%[outptr5], #0x20]\n"
813 "ldr q11, [%[inptr], #0x110]\n"
814 "add v10.4s, v10.4s, v2.4s\n"
815 "str q16, [%[outptr4], #0x20]\n"
816 "ldr q4, [%[outptr6]]\n"
817 "add %[outptr4], %[outptr4], #0x30\n"
818 "add v11.4s, v11.4s, v3.4s\n"
819 "str q17, [%[outptr5]]\n"
820 "ldr q12, [%[inptr], #0x120]\n"
821 "ldr q5, [%[outptr6], #0x10]\n"
822 "ldr q13, [%[inptr], #0x130]\n"
823 "str q10, [%[outptr5], #0x10]\n"
824 "add v12.4s, v12.4s, v4.4s\n"
825 "ldr q6, [%[outptr6], #0x20]\n"
826 "ldr q14, [%[inptr], #0x140]\n"
827 "add v13.4s, v13.4s, v5.4s\n"
828 "str q11, [%[outptr5], #0x20]\n"
829 "ldr q7, [%[outptr7]]\n"
830 "add %[outptr5], %[outptr5], #0x30\n"
831 "add v14.4s, v14.4s, v6.4s\n"
832 "str q12, [%[outptr6]]\n"
833 "ldr q15, [%[inptr], #0x150]\n"
834 "ldr q8, [%[outptr7], #0x10]\n"
835 "ldr q16, [%[inptr], #0x160]\n"
836 "str q13, [%[outptr6], #0x10]\n"
837 "add v15.4s, v15.4s, v7.4s\n"
838 "ldr q9, [%[outptr7], #0x20]\n"
839 "ldr q17, [%[inptr], #0x170]\n"
840 "add %[inptr], %[inptr], #0x180\n"
841 "add v16.4s, v16.4s, v8.4s\n"
842 "str q14, [%[outptr6], #0x20]\n"
843 "add %[outptr6], %[outptr6], #0x30\n"
844 "add v17.4s, v17.4s, v9.4s\n"
845 "str q15, [%[outptr7]]\n"
846 "str q16, [%[outptr7], #0x10]\n"
847 "str q17, [%[outptr7], #0x20]\n"
848 "add %[outptr7], %[outptr7], #0x30\n"
849 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
850 [inptr] "+r" (inptr)
851 :
852 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
853 );
854 }
855 }
856 break;
857
858
859 }
860 }
861 else
862 {
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100863 const int32_t *biasptr = bias ? bias + i : nullbias;
Georgios Pinitas48b3ef82019-10-14 19:03:09 +0100864
865 switch(height)
866 {
867 case 1:
868 {
869 if ((i+11) >= xmax)
870 {
871 for (int xi=0; xi<11; xi++)
872 {
873 if ((i+xi) < xmax)
874 {
875 *outptr0 = biasptr[xi] + inptr[xi];
876 outptr0++;
877 }
878 }
879 inptr += 96;
880 } else {
881 /* Optimized routine to copy an entire block */
882 __asm __volatile (
883 "ldr q2, [%[biasptr]]\n"
884 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
885 "ldr q3, [%[biasptr], #0x10]\n"
886 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
887 "ldr q4, [%[biasptr], #0x20]\n"
888 "ldr q13, [%[inptr]]\n"
889 "ldr q14, [%[inptr], #0x10]\n"
890 "ldr q15, [%[inptr], #0x20]\n"
891 "add %[inptr], %[inptr], #0x180\n"
892 "add v13.4s, v13.4s, v2.4s\n"
893 "add v14.4s, v14.4s, v3.4s\n"
894 "add v15.4s, v15.4s, v4.4s\n"
895 "str q13, [%[outptr0]]\n"
896 "str q14, [%[outptr0], #0x10]\n"
897 "str q15, [%[outptr0], #0x20]\n"
898 "add %[outptr0], %[outptr0], #0x30\n"
899 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
900 [inptr] "+r" (inptr)
901 : [biasptr] "r" (biasptr)
902 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
903 );
904 }
905 }
906 break;
907
908 case 2:
909 {
910 if ((i+11) >= xmax)
911 {
912 for (int xi=0; xi<11; xi++)
913 {
914 if ((i+xi) < xmax)
915 {
916 *outptr0 = biasptr[xi] + inptr[xi];
917 outptr0++;
918 *outptr1 = biasptr[xi] + inptr[xi + 12];
919 outptr1++;
920 }
921 }
922 inptr += 96;
923 } else {
924 /* Optimized routine to copy an entire block */
925 __asm __volatile (
926 "ldr q2, [%[biasptr]]\n"
927 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
928 "ldr q3, [%[biasptr], #0x10]\n"
929 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
930 "ldr q4, [%[biasptr], #0x20]\n"
931 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
932 "ldr q13, [%[inptr]]\n"
933 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
934 "ldr q14, [%[inptr], #0x10]\n"
935 "ldr q15, [%[inptr], #0x20]\n"
936 "add v13.4s, v13.4s, v2.4s\n"
937 "ldr q16, [%[inptr], #0x30]\n"
938 "ldr q17, [%[inptr], #0x40]\n"
939 "add v14.4s, v14.4s, v3.4s\n"
940 "ldr q18, [%[inptr], #0x50]\n"
941 "add v15.4s, v15.4s, v4.4s\n"
942 "str q13, [%[outptr0]]\n"
943 "add v16.4s, v16.4s, v2.4s\n"
944 "add %[inptr], %[inptr], #0x180\n"
945 "add v17.4s, v17.4s, v3.4s\n"
946 "str q14, [%[outptr0], #0x10]\n"
947 "add v18.4s, v18.4s, v4.4s\n"
948 "str q15, [%[outptr0], #0x20]\n"
949 "add %[outptr0], %[outptr0], #0x30\n"
950 "str q16, [%[outptr1]]\n"
951 "str q17, [%[outptr1], #0x10]\n"
952 "str q18, [%[outptr1], #0x20]\n"
953 "add %[outptr1], %[outptr1], #0x30\n"
954 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
955 [inptr] "+r" (inptr)
956 : [biasptr] "r" (biasptr)
957 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
958 );
959 }
960 }
961 break;
962
963 case 3:
964 {
965 if ((i+11) >= xmax)
966 {
967 for (int xi=0; xi<11; xi++)
968 {
969 if ((i+xi) < xmax)
970 {
971 *outptr0 = biasptr[xi] + inptr[xi];
972 outptr0++;
973 *outptr1 = biasptr[xi] + inptr[xi + 12];
974 outptr1++;
975 *outptr2 = biasptr[xi] + inptr[xi + 24];
976 outptr2++;
977 }
978 }
979 inptr += 96;
980 } else {
981 /* Optimized routine to copy an entire block */
982 __asm __volatile (
983 "ldr q2, [%[biasptr]]\n"
984 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
985 "ldr q3, [%[biasptr], #0x10]\n"
986 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
987 "ldr q4, [%[biasptr], #0x20]\n"
988 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
989 "ldr q13, [%[inptr]]\n"
990 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
991 "ldr q14, [%[inptr], #0x10]\n"
992 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
993 "add v13.4s, v13.4s, v2.4s\n"
994 "ldr q15, [%[inptr], #0x20]\n"
995 "ldr q16, [%[inptr], #0x30]\n"
996 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
997 "add v14.4s, v14.4s, v3.4s\n"
998 "str q13, [%[outptr0]]\n"
999 "add v15.4s, v15.4s, v4.4s\n"
1000 "ldr q17, [%[inptr], #0x40]\n"
1001 "add v16.4s, v16.4s, v2.4s\n"
1002 "ldr q18, [%[inptr], #0x50]\n"
1003 "ldr q19, [%[inptr], #0x60]\n"
1004 "str q14, [%[outptr0], #0x10]\n"
1005 "add v17.4s, v17.4s, v3.4s\n"
1006 "ldr q20, [%[inptr], #0x70]\n"
1007 "add v18.4s, v18.4s, v4.4s\n"
1008 "ldr q13, [%[inptr], #0x80]\n"
1009 "add v19.4s, v19.4s, v2.4s\n"
1010 "str q15, [%[outptr0], #0x20]\n"
1011 "add %[outptr0], %[outptr0], #0x30\n"
1012 "add v20.4s, v20.4s, v3.4s\n"
1013 "add %[inptr], %[inptr], #0x180\n"
1014 "add v13.4s, v13.4s, v4.4s\n"
1015 "str q16, [%[outptr1]]\n"
1016 "str q17, [%[outptr1], #0x10]\n"
1017 "str q18, [%[outptr1], #0x20]\n"
1018 "add %[outptr1], %[outptr1], #0x30\n"
1019 "str q19, [%[outptr2]]\n"
1020 "str q20, [%[outptr2], #0x10]\n"
1021 "str q13, [%[outptr2], #0x20]\n"
1022 "add %[outptr2], %[outptr2], #0x30\n"
1023 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1024 [inptr] "+r" (inptr)
1025 : [biasptr] "r" (biasptr)
1026 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1027 );
1028 }
1029 }
1030 break;
1031
1032 case 4:
1033 {
1034 if ((i+11) >= xmax)
1035 {
1036 for (int xi=0; xi<11; xi++)
1037 {
1038 if ((i+xi) < xmax)
1039 {
1040 *outptr0 = biasptr[xi] + inptr[xi];
1041 outptr0++;
1042 *outptr1 = biasptr[xi] + inptr[xi + 12];
1043 outptr1++;
1044 *outptr2 = biasptr[xi] + inptr[xi + 24];
1045 outptr2++;
1046 *outptr3 = biasptr[xi] + inptr[xi + 36];
1047 outptr3++;
1048 }
1049 }
1050 inptr += 96;
1051 } else {
1052 /* Optimized routine to copy an entire block */
1053 __asm __volatile (
1054 "ldr q2, [%[biasptr]]\n"
1055 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1056 "ldr q3, [%[biasptr], #0x10]\n"
1057 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1058 "ldr q4, [%[biasptr], #0x20]\n"
1059 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1060 "ldr q13, [%[inptr]]\n"
1061 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1062 "ldr q14, [%[inptr], #0x10]\n"
1063 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1064 "add v13.4s, v13.4s, v2.4s\n"
1065 "ldr q15, [%[inptr], #0x20]\n"
1066 "ldr q16, [%[inptr], #0x30]\n"
1067 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1068 "add v14.4s, v14.4s, v3.4s\n"
1069 "str q13, [%[outptr0]]\n"
1070 "add v15.4s, v15.4s, v4.4s\n"
1071 "ldr q17, [%[inptr], #0x40]\n"
1072 "add v16.4s, v16.4s, v2.4s\n"
1073 "ldr q18, [%[inptr], #0x50]\n"
1074 "ldr q19, [%[inptr], #0x60]\n"
1075 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1076 "add v17.4s, v17.4s, v3.4s\n"
1077 "str q14, [%[outptr0], #0x10]\n"
1078 "add v18.4s, v18.4s, v4.4s\n"
1079 "ldr q20, [%[inptr], #0x70]\n"
1080 "add v19.4s, v19.4s, v2.4s\n"
1081 "ldr q13, [%[inptr], #0x80]\n"
1082 "ldr q14, [%[inptr], #0x90]\n"
1083 "str q15, [%[outptr0], #0x20]\n"
1084 "add %[outptr0], %[outptr0], #0x30\n"
1085 "add v20.4s, v20.4s, v3.4s\n"
1086 "ldr q15, [%[inptr], #0xa0]\n"
1087 "add v13.4s, v13.4s, v4.4s\n"
1088 "str q16, [%[outptr1]]\n"
1089 "add v14.4s, v14.4s, v2.4s\n"
1090 "ldr q16, [%[inptr], #0xb0]\n"
1091 "add %[inptr], %[inptr], #0x180\n"
1092 "add v15.4s, v15.4s, v3.4s\n"
1093 "str q17, [%[outptr1], #0x10]\n"
1094 "add v16.4s, v16.4s, v4.4s\n"
1095 "str q18, [%[outptr1], #0x20]\n"
1096 "add %[outptr1], %[outptr1], #0x30\n"
1097 "str q19, [%[outptr2]]\n"
1098 "str q20, [%[outptr2], #0x10]\n"
1099 "str q13, [%[outptr2], #0x20]\n"
1100 "add %[outptr2], %[outptr2], #0x30\n"
1101 "str q14, [%[outptr3]]\n"
1102 "str q15, [%[outptr3], #0x10]\n"
1103 "str q16, [%[outptr3], #0x20]\n"
1104 "add %[outptr3], %[outptr3], #0x30\n"
1105 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1106 [inptr] "+r" (inptr)
1107 : [biasptr] "r" (biasptr)
1108 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1109 );
1110 }
1111 }
1112 break;
1113
1114 case 5:
1115 {
1116 if ((i+11) >= xmax)
1117 {
1118 for (int xi=0; xi<11; xi++)
1119 {
1120 if ((i+xi) < xmax)
1121 {
1122 *outptr0 = biasptr[xi] + inptr[xi];
1123 outptr0++;
1124 *outptr1 = biasptr[xi] + inptr[xi + 12];
1125 outptr1++;
1126 *outptr2 = biasptr[xi] + inptr[xi + 24];
1127 outptr2++;
1128 *outptr3 = biasptr[xi] + inptr[xi + 36];
1129 outptr3++;
1130 *outptr4 = biasptr[xi] + inptr[xi + 48];
1131 outptr4++;
1132 }
1133 }
1134 inptr += 96;
1135 } else {
1136 /* Optimized routine to copy an entire block */
1137 __asm __volatile (
1138 "ldr q2, [%[biasptr]]\n"
1139 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1140 "ldr q3, [%[biasptr], #0x10]\n"
1141 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1142 "ldr q4, [%[biasptr], #0x20]\n"
1143 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1144 "ldr q13, [%[inptr]]\n"
1145 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1146 "ldr q14, [%[inptr], #0x10]\n"
1147 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1148 "add v13.4s, v13.4s, v2.4s\n"
1149 "ldr q15, [%[inptr], #0x20]\n"
1150 "ldr q16, [%[inptr], #0x30]\n"
1151 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1152 "add v14.4s, v14.4s, v3.4s\n"
1153 "str q13, [%[outptr0]]\n"
1154 "add v15.4s, v15.4s, v4.4s\n"
1155 "ldr q17, [%[inptr], #0x40]\n"
1156 "add v16.4s, v16.4s, v2.4s\n"
1157 "ldr q18, [%[inptr], #0x50]\n"
1158 "ldr q19, [%[inptr], #0x60]\n"
1159 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1160 "add v17.4s, v17.4s, v3.4s\n"
1161 "str q14, [%[outptr0], #0x10]\n"
1162 "add v18.4s, v18.4s, v4.4s\n"
1163 "ldr q20, [%[inptr], #0x70]\n"
1164 "add v19.4s, v19.4s, v2.4s\n"
1165 "ldr q13, [%[inptr], #0x80]\n"
1166 "ldr q14, [%[inptr], #0x90]\n"
1167 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1168 "add v20.4s, v20.4s, v3.4s\n"
1169 "str q15, [%[outptr0], #0x20]\n"
1170 "add v13.4s, v13.4s, v4.4s\n"
1171 "ldr q15, [%[inptr], #0xa0]\n"
1172 "add v14.4s, v14.4s, v2.4s\n"
1173 "add %[outptr0], %[outptr0], #0x30\n"
1174 "str q16, [%[outptr1]]\n"
1175 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1176 "add v15.4s, v15.4s, v3.4s\n"
1177 "ldr q16, [%[inptr], #0xb0]\n"
1178 "str q17, [%[outptr1], #0x10]\n"
1179 "ldr q17, [%[inptr], #0xc0]\n"
1180 "add v16.4s, v16.4s, v4.4s\n"
1181 "str q18, [%[outptr1], #0x20]\n"
1182 "add %[outptr1], %[outptr1], #0x30\n"
1183 "add v17.4s, v17.4s, v2.4s\n"
1184 "ldr q18, [%[inptr], #0xd0]\n"
1185 "str q19, [%[outptr2]]\n"
1186 "ldr q19, [%[inptr], #0xe0]\n"
1187 "add %[inptr], %[inptr], #0x180\n"
1188 "add v18.4s, v18.4s, v3.4s\n"
1189 "str q20, [%[outptr2], #0x10]\n"
1190 "add v19.4s, v19.4s, v4.4s\n"
1191 "str q13, [%[outptr2], #0x20]\n"
1192 "add %[outptr2], %[outptr2], #0x30\n"
1193 "str q14, [%[outptr3]]\n"
1194 "str q15, [%[outptr3], #0x10]\n"
1195 "str q16, [%[outptr3], #0x20]\n"
1196 "add %[outptr3], %[outptr3], #0x30\n"
1197 "str q17, [%[outptr4]]\n"
1198 "str q18, [%[outptr4], #0x10]\n"
1199 "str q19, [%[outptr4], #0x20]\n"
1200 "add %[outptr4], %[outptr4], #0x30\n"
1201 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1202 [inptr] "+r" (inptr)
1203 : [biasptr] "r" (biasptr)
1204 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1205 );
1206 }
1207 }
1208 break;
1209
1210 case 6:
1211 {
1212 if ((i+11) >= xmax)
1213 {
1214 for (int xi=0; xi<11; xi++)
1215 {
1216 if ((i+xi) < xmax)
1217 {
1218 *outptr0 = biasptr[xi] + inptr[xi];
1219 outptr0++;
1220 *outptr1 = biasptr[xi] + inptr[xi + 12];
1221 outptr1++;
1222 *outptr2 = biasptr[xi] + inptr[xi + 24];
1223 outptr2++;
1224 *outptr3 = biasptr[xi] + inptr[xi + 36];
1225 outptr3++;
1226 *outptr4 = biasptr[xi] + inptr[xi + 48];
1227 outptr4++;
1228 *outptr5 = biasptr[xi] + inptr[xi + 60];
1229 outptr5++;
1230 }
1231 }
1232 inptr += 96;
1233 } else {
1234 /* Optimized routine to copy an entire block */
1235 __asm __volatile (
1236 "ldr q2, [%[biasptr]]\n"
1237 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1238 "ldr q3, [%[biasptr], #0x10]\n"
1239 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1240 "ldr q4, [%[biasptr], #0x20]\n"
1241 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1242 "ldr q13, [%[inptr]]\n"
1243 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1244 "ldr q14, [%[inptr], #0x10]\n"
1245 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1246 "add v13.4s, v13.4s, v2.4s\n"
1247 "ldr q15, [%[inptr], #0x20]\n"
1248 "ldr q16, [%[inptr], #0x30]\n"
1249 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1250 "add v14.4s, v14.4s, v3.4s\n"
1251 "str q13, [%[outptr0]]\n"
1252 "add v15.4s, v15.4s, v4.4s\n"
1253 "ldr q17, [%[inptr], #0x40]\n"
1254 "add v16.4s, v16.4s, v2.4s\n"
1255 "ldr q18, [%[inptr], #0x50]\n"
1256 "ldr q19, [%[inptr], #0x60]\n"
1257 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1258 "add v17.4s, v17.4s, v3.4s\n"
1259 "str q14, [%[outptr0], #0x10]\n"
1260 "add v18.4s, v18.4s, v4.4s\n"
1261 "ldr q20, [%[inptr], #0x70]\n"
1262 "add v19.4s, v19.4s, v2.4s\n"
1263 "ldr q13, [%[inptr], #0x80]\n"
1264 "ldr q14, [%[inptr], #0x90]\n"
1265 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1266 "add v20.4s, v20.4s, v3.4s\n"
1267 "str q15, [%[outptr0], #0x20]\n"
1268 "add v13.4s, v13.4s, v4.4s\n"
1269 "ldr q15, [%[inptr], #0xa0]\n"
1270 "add v14.4s, v14.4s, v2.4s\n"
1271 "add %[outptr0], %[outptr0], #0x30\n"
1272 "str q16, [%[outptr1]]\n"
1273 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1274 "add v15.4s, v15.4s, v3.4s\n"
1275 "ldr q16, [%[inptr], #0xb0]\n"
1276 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1277 "str q17, [%[outptr1], #0x10]\n"
1278 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1279 "add v16.4s, v16.4s, v4.4s\n"
1280 "ldr q17, [%[inptr], #0xc0]\n"
1281 "str q18, [%[outptr1], #0x20]\n"
1282 "add %[outptr1], %[outptr1], #0x30\n"
1283 "add v17.4s, v17.4s, v2.4s\n"
1284 "ldr q18, [%[inptr], #0xd0]\n"
1285 "str q19, [%[outptr2]]\n"
1286 "ldr q19, [%[inptr], #0xe0]\n"
1287 "add v18.4s, v18.4s, v3.4s\n"
1288 "str q20, [%[outptr2], #0x10]\n"
1289 "add v19.4s, v19.4s, v4.4s\n"
1290 "ldr q20, [%[inptr], #0xf0]\n"
1291 "str q13, [%[outptr2], #0x20]\n"
1292 "add %[outptr2], %[outptr2], #0x30\n"
1293 "add v20.4s, v20.4s, v2.4s\n"
1294 "ldr q13, [%[inptr], #0x100]\n"
1295 "str q14, [%[outptr3]]\n"
1296 "ldr q14, [%[inptr], #0x110]\n"
1297 "add %[inptr], %[inptr], #0x180\n"
1298 "add v13.4s, v13.4s, v3.4s\n"
1299 "str q15, [%[outptr3], #0x10]\n"
1300 "add v14.4s, v14.4s, v4.4s\n"
1301 "str q16, [%[outptr3], #0x20]\n"
1302 "add %[outptr3], %[outptr3], #0x30\n"
1303 "str q17, [%[outptr4]]\n"
1304 "str q18, [%[outptr4], #0x10]\n"
1305 "str q19, [%[outptr4], #0x20]\n"
1306 "add %[outptr4], %[outptr4], #0x30\n"
1307 "str q20, [%[outptr5]]\n"
1308 "str q13, [%[outptr5], #0x10]\n"
1309 "str q14, [%[outptr5], #0x20]\n"
1310 "add %[outptr5], %[outptr5], #0x30\n"
1311 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1312 [inptr] "+r" (inptr)
1313 : [biasptr] "r" (biasptr)
1314 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1315 );
1316 }
1317 }
1318 break;
1319
1320 case 7:
1321 {
1322 if ((i+11) >= xmax)
1323 {
1324 for (int xi=0; xi<11; xi++)
1325 {
1326 if ((i+xi) < xmax)
1327 {
1328 *outptr0 = biasptr[xi] + inptr[xi];
1329 outptr0++;
1330 *outptr1 = biasptr[xi] + inptr[xi + 12];
1331 outptr1++;
1332 *outptr2 = biasptr[xi] + inptr[xi + 24];
1333 outptr2++;
1334 *outptr3 = biasptr[xi] + inptr[xi + 36];
1335 outptr3++;
1336 *outptr4 = biasptr[xi] + inptr[xi + 48];
1337 outptr4++;
1338 *outptr5 = biasptr[xi] + inptr[xi + 60];
1339 outptr5++;
1340 *outptr6 = biasptr[xi] + inptr[xi + 72];
1341 outptr6++;
1342 }
1343 }
1344 inptr += 96;
1345 } else {
1346 /* Optimized routine to copy an entire block */
1347 __asm __volatile (
1348 "ldr q2, [%[biasptr]]\n"
1349 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1350 "ldr q3, [%[biasptr], #0x10]\n"
1351 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1352 "ldr q4, [%[biasptr], #0x20]\n"
1353 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1354 "ldr q13, [%[inptr]]\n"
1355 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1356 "ldr q14, [%[inptr], #0x10]\n"
1357 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1358 "add v13.4s, v13.4s, v2.4s\n"
1359 "ldr q15, [%[inptr], #0x20]\n"
1360 "ldr q16, [%[inptr], #0x30]\n"
1361 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1362 "add v14.4s, v14.4s, v3.4s\n"
1363 "str q13, [%[outptr0]]\n"
1364 "add v15.4s, v15.4s, v4.4s\n"
1365 "ldr q17, [%[inptr], #0x40]\n"
1366 "add v16.4s, v16.4s, v2.4s\n"
1367 "ldr q18, [%[inptr], #0x50]\n"
1368 "ldr q19, [%[inptr], #0x60]\n"
1369 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1370 "add v17.4s, v17.4s, v3.4s\n"
1371 "str q14, [%[outptr0], #0x10]\n"
1372 "add v18.4s, v18.4s, v4.4s\n"
1373 "ldr q20, [%[inptr], #0x70]\n"
1374 "add v19.4s, v19.4s, v2.4s\n"
1375 "ldr q13, [%[inptr], #0x80]\n"
1376 "ldr q14, [%[inptr], #0x90]\n"
1377 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1378 "add v20.4s, v20.4s, v3.4s\n"
1379 "str q15, [%[outptr0], #0x20]\n"
1380 "add v13.4s, v13.4s, v4.4s\n"
1381 "ldr q15, [%[inptr], #0xa0]\n"
1382 "add v14.4s, v14.4s, v2.4s\n"
1383 "add %[outptr0], %[outptr0], #0x30\n"
1384 "str q16, [%[outptr1]]\n"
1385 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1386 "add v15.4s, v15.4s, v3.4s\n"
1387 "ldr q16, [%[inptr], #0xb0]\n"
1388 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1389 "str q17, [%[outptr1], #0x10]\n"
1390 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1391 "add v16.4s, v16.4s, v4.4s\n"
1392 "ldr q17, [%[inptr], #0xc0]\n"
1393 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1394 "str q18, [%[outptr1], #0x20]\n"
1395 "add %[outptr1], %[outptr1], #0x30\n"
1396 "add v17.4s, v17.4s, v2.4s\n"
1397 "ldr q18, [%[inptr], #0xd0]\n"
1398 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1399 "str q19, [%[outptr2]]\n"
1400 "ldr q19, [%[inptr], #0xe0]\n"
1401 "add v18.4s, v18.4s, v3.4s\n"
1402 "str q20, [%[outptr2], #0x10]\n"
1403 "add v19.4s, v19.4s, v4.4s\n"
1404 "ldr q20, [%[inptr], #0xf0]\n"
1405 "str q13, [%[outptr2], #0x20]\n"
1406 "add %[outptr2], %[outptr2], #0x30\n"
1407 "add v20.4s, v20.4s, v2.4s\n"
1408 "ldr q13, [%[inptr], #0x100]\n"
1409 "str q14, [%[outptr3]]\n"
1410 "ldr q14, [%[inptr], #0x110]\n"
1411 "add v13.4s, v13.4s, v3.4s\n"
1412 "str q15, [%[outptr3], #0x10]\n"
1413 "add v14.4s, v14.4s, v4.4s\n"
1414 "ldr q15, [%[inptr], #0x120]\n"
1415 "str q16, [%[outptr3], #0x20]\n"
1416 "add %[outptr3], %[outptr3], #0x30\n"
1417 "add v15.4s, v15.4s, v2.4s\n"
1418 "ldr q16, [%[inptr], #0x130]\n"
1419 "str q17, [%[outptr4]]\n"
1420 "ldr q17, [%[inptr], #0x140]\n"
1421 "add %[inptr], %[inptr], #0x180\n"
1422 "add v16.4s, v16.4s, v3.4s\n"
1423 "str q18, [%[outptr4], #0x10]\n"
1424 "add v17.4s, v17.4s, v4.4s\n"
1425 "str q19, [%[outptr4], #0x20]\n"
1426 "add %[outptr4], %[outptr4], #0x30\n"
1427 "str q20, [%[outptr5]]\n"
1428 "str q13, [%[outptr5], #0x10]\n"
1429 "str q14, [%[outptr5], #0x20]\n"
1430 "add %[outptr5], %[outptr5], #0x30\n"
1431 "str q15, [%[outptr6]]\n"
1432 "str q16, [%[outptr6], #0x10]\n"
1433 "str q17, [%[outptr6], #0x20]\n"
1434 "add %[outptr6], %[outptr6], #0x30\n"
1435 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1436 [inptr] "+r" (inptr)
1437 : [biasptr] "r" (biasptr)
1438 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1439 );
1440 }
1441 }
1442 break;
1443
1444 default:
1445 case 8:
1446 {
1447 if ((i+11) >= xmax)
1448 {
1449 for (int xi=0; xi<11; xi++)
1450 {
1451 if ((i+xi) < xmax)
1452 {
1453 *outptr0 = biasptr[xi] + inptr[xi];
1454 outptr0++;
1455 *outptr1 = biasptr[xi] + inptr[xi + 12];
1456 outptr1++;
1457 *outptr2 = biasptr[xi] + inptr[xi + 24];
1458 outptr2++;
1459 *outptr3 = biasptr[xi] + inptr[xi + 36];
1460 outptr3++;
1461 *outptr4 = biasptr[xi] + inptr[xi + 48];
1462 outptr4++;
1463 *outptr5 = biasptr[xi] + inptr[xi + 60];
1464 outptr5++;
1465 *outptr6 = biasptr[xi] + inptr[xi + 72];
1466 outptr6++;
1467 *outptr7 = biasptr[xi] + inptr[xi + 84];
1468 outptr7++;
1469 }
1470 }
1471 inptr += 96;
1472 } else {
1473 /* Optimized routine to copy an entire block */
1474 __asm __volatile (
1475 "ldr q2, [%[biasptr]]\n"
1476 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1477 "ldr q3, [%[biasptr], #0x10]\n"
1478 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1479 "ldr q4, [%[biasptr], #0x20]\n"
1480 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1481 "ldr q13, [%[inptr]]\n"
1482 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1483 "ldr q14, [%[inptr], #0x10]\n"
1484 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1485 "add v13.4s, v13.4s, v2.4s\n"
1486 "ldr q15, [%[inptr], #0x20]\n"
1487 "ldr q16, [%[inptr], #0x30]\n"
1488 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1489 "add v14.4s, v14.4s, v3.4s\n"
1490 "str q13, [%[outptr0]]\n"
1491 "add v15.4s, v15.4s, v4.4s\n"
1492 "ldr q17, [%[inptr], #0x40]\n"
1493 "add v16.4s, v16.4s, v2.4s\n"
1494 "ldr q18, [%[inptr], #0x50]\n"
1495 "ldr q19, [%[inptr], #0x60]\n"
1496 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1497 "add v17.4s, v17.4s, v3.4s\n"
1498 "str q14, [%[outptr0], #0x10]\n"
1499 "add v18.4s, v18.4s, v4.4s\n"
1500 "ldr q20, [%[inptr], #0x70]\n"
1501 "add v19.4s, v19.4s, v2.4s\n"
1502 "ldr q13, [%[inptr], #0x80]\n"
1503 "ldr q14, [%[inptr], #0x90]\n"
1504 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1505 "add v20.4s, v20.4s, v3.4s\n"
1506 "str q15, [%[outptr0], #0x20]\n"
1507 "add v13.4s, v13.4s, v4.4s\n"
1508 "ldr q15, [%[inptr], #0xa0]\n"
1509 "add v14.4s, v14.4s, v2.4s\n"
1510 "add %[outptr0], %[outptr0], #0x30\n"
1511 "str q16, [%[outptr1]]\n"
1512 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1513 "add v15.4s, v15.4s, v3.4s\n"
1514 "ldr q16, [%[inptr], #0xb0]\n"
1515 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1516 "str q17, [%[outptr1], #0x10]\n"
1517 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1518 "add v16.4s, v16.4s, v4.4s\n"
1519 "ldr q17, [%[inptr], #0xc0]\n"
1520 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1521 "str q18, [%[outptr1], #0x20]\n"
1522 "add %[outptr1], %[outptr1], #0x30\n"
1523 "add v17.4s, v17.4s, v2.4s\n"
1524 "ldr q18, [%[inptr], #0xd0]\n"
1525 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1526 "str q19, [%[outptr2]]\n"
1527 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1528 "add v18.4s, v18.4s, v3.4s\n"
1529 "ldr q19, [%[inptr], #0xe0]\n"
1530 "str q20, [%[outptr2], #0x10]\n"
1531 "ldr q20, [%[inptr], #0xf0]\n"
1532 "add v19.4s, v19.4s, v4.4s\n"
1533 "str q13, [%[outptr2], #0x20]\n"
1534 "add %[outptr2], %[outptr2], #0x30\n"
1535 "add v20.4s, v20.4s, v2.4s\n"
1536 "ldr q13, [%[inptr], #0x100]\n"
1537 "str q14, [%[outptr3]]\n"
1538 "ldr q14, [%[inptr], #0x110]\n"
1539 "add v13.4s, v13.4s, v3.4s\n"
1540 "str q15, [%[outptr3], #0x10]\n"
1541 "add v14.4s, v14.4s, v4.4s\n"
1542 "ldr q15, [%[inptr], #0x120]\n"
1543 "str q16, [%[outptr3], #0x20]\n"
1544 "add %[outptr3], %[outptr3], #0x30\n"
1545 "add v15.4s, v15.4s, v2.4s\n"
1546 "ldr q16, [%[inptr], #0x130]\n"
1547 "str q17, [%[outptr4]]\n"
1548 "ldr q17, [%[inptr], #0x140]\n"
1549 "add v16.4s, v16.4s, v3.4s\n"
1550 "str q18, [%[outptr4], #0x10]\n"
1551 "add v17.4s, v17.4s, v4.4s\n"
1552 "ldr q18, [%[inptr], #0x150]\n"
1553 "str q19, [%[outptr4], #0x20]\n"
1554 "add %[outptr4], %[outptr4], #0x30\n"
1555 "add v18.4s, v18.4s, v2.4s\n"
1556 "ldr q19, [%[inptr], #0x160]\n"
1557 "str q20, [%[outptr5]]\n"
1558 "ldr q20, [%[inptr], #0x170]\n"
1559 "add %[inptr], %[inptr], #0x180\n"
1560 "add v19.4s, v19.4s, v3.4s\n"
1561 "str q13, [%[outptr5], #0x10]\n"
1562 "add v20.4s, v20.4s, v4.4s\n"
1563 "str q14, [%[outptr5], #0x20]\n"
1564 "add %[outptr5], %[outptr5], #0x30\n"
1565 "str q15, [%[outptr6]]\n"
1566 "str q16, [%[outptr6], #0x10]\n"
1567 "str q17, [%[outptr6], #0x20]\n"
1568 "add %[outptr6], %[outptr6], #0x30\n"
1569 "str q18, [%[outptr7]]\n"
1570 "str q19, [%[outptr7], #0x10]\n"
1571 "str q20, [%[outptr7], #0x20]\n"
1572 "add %[outptr7], %[outptr7], #0x30\n"
1573 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1574 [inptr] "+r" (inptr)
1575 : [biasptr] "r" (biasptr)
1576 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1577 );
1578 }
1579 }
1580 break;
1581
1582
1583 }
1584 }
1585 }
1586 }
1587}
1588
1589#endif // __aarch64__