blob: 752e837f8dc0902ca5352a43d8054b8ded8eee09 [file] [log] [blame]
Georgios Pinitas421405b2018-10-26 19:05:32 +01001/*
2 * Copyright (c) 2018 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
/**
 * SVE specialisation of TransformImpl<8, 1, false, 4, 4, false>::Transform.
 *
 * Treats source and destination as arrays of 32-bit words and interleaves
 * rows [y0, ymax) of the source, eight rows at a time, over columns [k0, kmax).
 * For every group of eight rows the output receives (kmax - k0) * 8 words:
 * for each column in turn, the eight words taken from rows 0..7 of the group.
 * When fewer than eight rows remain, the missing rows are emitted as zeros.
 *
 * @param out  Destination buffer; written contiguously, one row group after another.
 * @param in   Source matrix; element (row, col) is read from in[row * ldin + col].
 * @param ldin Distance between consecutive source rows, in elements.
 * @param y0   First source row to process.
 * @param ymax One past the last source row to process.
 * @param k0   First source column to process.
 * @param kmax One past the last source column to process.
 *
 * NOTE(review): T is reinterpreted as uint32_t without any size check, so it is
 * presumably a 4-byte type - confirm against the primary TransformImpl template.
 * NOTE(review): `y * ldin` is evaluated in int; confirm callers never pass
 * matrices large enough for that product to overflow.
 */
28template<>
29template<typename T>
30inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
31{
 // Both buffers are handled as opaque 32-bit words from here on.
32 uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
33 const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
34
 // Each iteration consumes one group of (up to) eight source rows.
35 for (int y=y0; y<ymax; y+=8)
36 {
 // Rows remaining from y; values above 8 take the full 8-row path (default label).
37 const int height = ymax-y;
 // Columns to read per row, and words written for this row group.
38 const long inwidth = (kmax - k0);
39 const long outwidth = inwidth * 8;
 // Running element counters; the asm feeds them to whilelt to build the
 // load predicate (inpos vs inwidth) and store predicates (outpos vs outwidth).
40 long inpos = 0;
41 long outpos = 0;
42
 // This group's output starts at master_outptr, which then moves past it.
43 uint32_t *outptr = master_outptr;
44 master_outptr += outwidth;
45
 // Start-of-row pointers for the eight rows of the group. For height < 8 the
 // trailing pointers are still computed but the selected case never loads from them.
46 const uint32_t *inptr0 = inptr + y * ldin + k0;
47 const uint32_t *inptr1 = inptr0 + ldin;
48 const uint32_t *inptr2 = inptr1 + ldin;
49 const uint32_t *inptr3 = inptr2 + ldin;
50 const uint32_t *inptr4 = inptr3 + ldin;
51 const uint32_t *inptr5 = inptr4 + ldin;
52 const uint32_t *inptr6 = inptr5 + ldin;
53 const uint32_t *inptr7 = inptr6 + ldin;
54
 // Every case runs the same three-stage zip network; they differ only in how
 // many rows are loaded and which zeroed register stands in for absent rows.
55 switch(height)
56 {
 // One valid row: z4 is zeroed and used as every missing zip partner.
57 case 1:
58 __asm __volatile(
59 "1:\n"
60 "whilelt p0.s, %[inpos], %[inwidth]\n"
61 "b.none 2f\n"
62 "mov z4.s, #0\n"
63 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
64 "incw %[inpos], all, mul #1\n"
65 "whilelt p0.s, %[outpos], %[outwidth]\n"
66 "incw %[outpos], all, mul #1\n"
67 "zip1 z8.s, z0.s, z4.s\n"
68 "zip2 z9.s, z0.s, z4.s\n"
69 "whilelt p1.s, %[outpos], %[outwidth]\n"
70 "incw %[outpos], all, mul #1\n"
71 "zip1 z0.s, z8.s, z4.s\n"
72 "zip2 z1.s, z8.s, z4.s\n"
73 "zip1 z2.s, z9.s, z4.s\n"
74 "zip2 z3.s, z9.s, z4.s\n"
75 "whilelt p2.s, %[outpos], %[outwidth]\n"
76 "zip1 z8.s, z0.s, z4.s\n"
77 "incw %[outpos], all, mul #1\n"
78 "zip2 z9.s, z0.s, z4.s\n"
79 "zip1 z10.s, z1.s, z4.s\n"
80 "zip2 z11.s, z1.s, z4.s\n"
81 "st1w z8.s, p0, [%[outptr]]\n"
82 "zip1 z12.s, z2.s, z4.s\n"
83 "whilelt p3.s, %[outpos], %[outwidth]\n"
84 "zip2 z13.s, z2.s, z4.s\n"
85 "incw %[outpos], all, mul #1\n"
86 "zip1 z14.s, z3.s, z4.s\n"
87 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
88 "zip2 z15.s, z3.s, z4.s\n"
89 "whilelt p4.s, %[outpos], %[outwidth]\n"
90 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
91 "incw %[outpos], all, mul #1\n"
92 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
93 "whilelt p5.s, %[outpos], %[outwidth]\n"
94 "incw %[outpos], all, mul #1\n"
95 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
96 "whilelt p6.s, %[outpos], %[outwidth]\n"
97 "incw %[outpos], all, mul #1\n"
98 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
99 "whilelt p7.s, %[outpos], %[outwidth]\n"
100 "incw %[outpos], all, mul #1\n"
101 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
102 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
103 "addvl %[outptr], %[outptr], #8\n"
104 "b 1b\n"
105 "2:\n"
106 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
107 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
108 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
109 );
110 break;
111
 // Two valid rows: z4 and z14 are zeroed; z4 is later reused as a stage-two
 // result, so z14 supplies the zeros for the remaining stage-two zips.
112 case 2:
113 __asm __volatile(
114 "1:\n"
115 "whilelt p0.s, %[inpos], %[inwidth]\n"
116 "b.none 2f\n"
117 "mov z4.s, #0\n"
118 "mov z14.s, #0\n"
119 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
120 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
121 "incw %[inpos], all, mul #1\n"
122 "whilelt p0.s, %[outpos], %[outwidth]\n"
123 "incw %[outpos], all, mul #1\n"
124 "zip1 z8.s, z0.s, z4.s\n"
125 "zip2 z9.s, z0.s, z4.s\n"
126 "zip1 z10.s, z1.s, z4.s\n"
127 "zip2 z11.s, z1.s, z4.s\n"
128 "whilelt p1.s, %[outpos], %[outwidth]\n"
129 "zip1 z0.s, z8.s, z4.s\n"
130 "incw %[outpos], all, mul #1\n"
131 "zip2 z1.s, z8.s, z4.s\n"
132 "zip1 z2.s, z9.s, z4.s\n"
133 "zip2 z3.s, z9.s, z4.s\n"
134 "zip1 z4.s, z10.s, z14.s\n"
135 "whilelt p2.s, %[outpos], %[outwidth]\n"
136 "zip2 z5.s, z10.s, z14.s\n"
137 "incw %[outpos], all, mul #1\n"
138 "zip1 z6.s, z11.s, z14.s\n"
139 "zip2 z7.s, z11.s, z14.s\n"
140 "zip1 z8.s, z0.s, z4.s\n"
141 "zip2 z9.s, z0.s, z4.s\n"
142 "whilelt p3.s, %[outpos], %[outwidth]\n"
143 "zip1 z10.s, z1.s, z5.s\n"
144 "incw %[outpos], all, mul #1\n"
145 "zip2 z11.s, z1.s, z5.s\n"
146 "st1w z8.s, p0, [%[outptr]]\n"
147 "zip1 z12.s, z2.s, z6.s\n"
148 "zip2 z13.s, z2.s, z6.s\n"
149 "zip1 z14.s, z3.s, z7.s\n"
150 "whilelt p4.s, %[outpos], %[outwidth]\n"
151 "zip2 z15.s, z3.s, z7.s\n"
152 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
153 "incw %[outpos], all, mul #1\n"
154 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
155 "whilelt p5.s, %[outpos], %[outwidth]\n"
156 "incw %[outpos], all, mul #1\n"
157 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
158 "whilelt p6.s, %[outpos], %[outwidth]\n"
159 "incw %[outpos], all, mul #1\n"
160 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
161 "whilelt p7.s, %[outpos], %[outwidth]\n"
162 "incw %[outpos], all, mul #1\n"
163 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
164 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
165 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
166 "addvl %[outptr], %[outptr], #8\n"
167 "b 1b\n"
168 "2:\n"
169 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
170 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
171 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
172 );
173 break;
174
 // Three valid rows: z4 provides the stage-one zeros, z14 the stage-two zeros.
175 case 3:
176 __asm __volatile(
177 "1:\n"
178 "whilelt p0.s, %[inpos], %[inwidth]\n"
179 "b.none 2f\n"
180 "mov z4.s, #0\n"
181 "mov z14.s, #0\n"
182 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
183 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
184 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
185 "incw %[inpos], all, mul #1\n"
186 "whilelt p0.s, %[outpos], %[outwidth]\n"
187 "zip1 z8.s, z0.s, z4.s\n"
188 "incw %[outpos], all, mul #1\n"
189 "zip2 z9.s, z0.s, z4.s\n"
190 "zip1 z10.s, z1.s, z4.s\n"
191 "zip2 z11.s, z1.s, z4.s\n"
192 "zip1 z12.s, z2.s, z4.s\n"
193 "whilelt p1.s, %[outpos], %[outwidth]\n"
194 "zip2 z13.s, z2.s, z4.s\n"
195 "incw %[outpos], all, mul #1\n"
196 "zip1 z4.s, z10.s, z14.s\n"
197 "zip1 z0.s, z8.s, z12.s\n"
198 "zip2 z1.s, z8.s, z12.s\n"
199 "zip1 z2.s, z9.s, z13.s\n"
200 "whilelt p2.s, %[outpos], %[outwidth]\n"
201 "zip2 z3.s, z9.s, z13.s\n"
202 "incw %[outpos], all, mul #1\n"
203 "zip2 z5.s, z10.s, z14.s\n"
204 "zip1 z6.s, z11.s, z14.s\n"
205 "zip2 z7.s, z11.s, z14.s\n"
206 "zip1 z8.s, z0.s, z4.s\n"
207 "whilelt p3.s, %[outpos], %[outwidth]\n"
208 "zip2 z9.s, z0.s, z4.s\n"
209 "incw %[outpos], all, mul #1\n"
210 "zip1 z10.s, z1.s, z5.s\n"
211 "st1w z8.s, p0, [%[outptr]]\n"
212 "zip2 z11.s, z1.s, z5.s\n"
213 "zip1 z12.s, z2.s, z6.s\n"
214 "zip2 z13.s, z2.s, z6.s\n"
215 "whilelt p4.s, %[outpos], %[outwidth]\n"
216 "zip1 z14.s, z3.s, z7.s\n"
217 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
218 "zip2 z15.s, z3.s, z7.s\n"
219 "incw %[outpos], all, mul #1\n"
220 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
221 "whilelt p5.s, %[outpos], %[outwidth]\n"
222 "incw %[outpos], all, mul #1\n"
223 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
224 "whilelt p6.s, %[outpos], %[outwidth]\n"
225 "incw %[outpos], all, mul #1\n"
226 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
227 "whilelt p7.s, %[outpos], %[outwidth]\n"
228 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
229 "incw %[outpos], all, mul #1\n"
230 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
231 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
232 "addvl %[outptr], %[outptr], #8\n"
233 "b 1b\n"
234 "2:\n"
235 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
236 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
237 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
238 );
239 break;
240
 // Four valid rows: rows 4-7 are all replaced by the zeroed z4 in stage one.
241 case 4:
242 __asm __volatile(
243 "1:\n"
244 "whilelt p0.s, %[inpos], %[inwidth]\n"
245 "b.none 2f\n"
246 "mov z4.s, #0\n"
247 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
248 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
249 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
250 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
251 "incw %[inpos], all, mul #1\n"
252 "zip1 z8.s, z0.s, z4.s\n"
253 "whilelt p0.s, %[outpos], %[outwidth]\n"
254 "zip2 z9.s, z0.s, z4.s\n"
255 "incw %[outpos], all, mul #1\n"
256 "zip1 z10.s, z1.s, z4.s\n"
257 "zip2 z11.s, z1.s, z4.s\n"
258 "zip1 z12.s, z2.s, z4.s\n"
259 "zip2 z13.s, z2.s, z4.s\n"
260 "whilelt p1.s, %[outpos], %[outwidth]\n"
261 "zip1 z14.s, z3.s, z4.s\n"
262 "incw %[outpos], all, mul #1\n"
263 "zip2 z15.s, z3.s, z4.s\n"
264 "zip1 z0.s, z8.s, z12.s\n"
265 "zip2 z1.s, z8.s, z12.s\n"
266 "zip1 z2.s, z9.s, z13.s\n"
267 "whilelt p2.s, %[outpos], %[outwidth]\n"
268 "zip2 z3.s, z9.s, z13.s\n"
269 "incw %[outpos], all, mul #1\n"
270 "zip1 z4.s, z10.s, z14.s\n"
271 "zip2 z5.s, z10.s, z14.s\n"
272 "zip1 z6.s, z11.s, z15.s\n"
273 "zip2 z7.s, z11.s, z15.s\n"
274 "whilelt p3.s, %[outpos], %[outwidth]\n"
275 "zip1 z8.s, z0.s, z4.s\n"
276 "incw %[outpos], all, mul #1\n"
277 "zip2 z9.s, z0.s, z4.s\n"
278 "zip1 z10.s, z1.s, z5.s\n"
279 "zip2 z11.s, z1.s, z5.s\n"
280 "st1w z8.s, p0, [%[outptr]]\n"
281 "zip1 z12.s, z2.s, z6.s\n"
282 "whilelt p4.s, %[outpos], %[outwidth]\n"
283 "zip2 z13.s, z2.s, z6.s\n"
284 "incw %[outpos], all, mul #1\n"
285 "zip1 z14.s, z3.s, z7.s\n"
286 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
287 "zip2 z15.s, z3.s, z7.s\n"
288 "whilelt p5.s, %[outpos], %[outwidth]\n"
289 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
290 "incw %[outpos], all, mul #1\n"
291 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
292 "whilelt p6.s, %[outpos], %[outwidth]\n"
293 "incw %[outpos], all, mul #1\n"
294 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
295 "whilelt p7.s, %[outpos], %[outwidth]\n"
296 "incw %[outpos], all, mul #1\n"
297 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
298 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
299 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
300 "addvl %[outptr], %[outptr], #8\n"
301 "b 1b\n"
302 "2:\n"
303 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
304 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
305 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
306 );
307 break;
308
 // Five valid rows: row 4 is loaded into z4; zeroed z5 stands in for rows 5-7.
309 case 5:
310 __asm __volatile(
311 "1:\n"
312 "whilelt p0.s, %[inpos], %[inwidth]\n"
313 "b.none 2f\n"
314 "mov z5.s, #0\n"
315 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
316 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
317 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
318 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
319 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
320 "incw %[inpos], all, mul #1\n"
321 "zip1 z10.s, z1.s, z5.s\n"
322 "whilelt p0.s, %[outpos], %[outwidth]\n"
323 "zip2 z11.s, z1.s, z5.s\n"
324 "incw %[outpos], all, mul #1\n"
325 "zip1 z8.s, z0.s, z4.s\n"
326 "zip2 z9.s, z0.s, z4.s\n"
327 "zip1 z12.s, z2.s, z5.s\n"
328 "zip2 z13.s, z2.s, z5.s\n"
329 "whilelt p1.s, %[outpos], %[outwidth]\n"
330 "zip1 z14.s, z3.s, z5.s\n"
331 "incw %[outpos], all, mul #1\n"
332 "zip2 z15.s, z3.s, z5.s\n"
333 "zip1 z0.s, z8.s, z12.s\n"
334 "zip2 z1.s, z8.s, z12.s\n"
335 "zip1 z2.s, z9.s, z13.s\n"
336 "whilelt p2.s, %[outpos], %[outwidth]\n"
337 "zip2 z3.s, z9.s, z13.s\n"
338 "incw %[outpos], all, mul #1\n"
339 "zip1 z4.s, z10.s, z14.s\n"
340 "zip2 z5.s, z10.s, z14.s\n"
341 "zip1 z6.s, z11.s, z15.s\n"
342 "zip2 z7.s, z11.s, z15.s\n"
343 "whilelt p3.s, %[outpos], %[outwidth]\n"
344 "zip1 z8.s, z0.s, z4.s\n"
345 "incw %[outpos], all, mul #1\n"
346 "zip2 z9.s, z0.s, z4.s\n"
347 "zip1 z10.s, z1.s, z5.s\n"
348 "zip2 z11.s, z1.s, z5.s\n"
349 "st1w z8.s, p0, [%[outptr]]\n"
350 "zip1 z12.s, z2.s, z6.s\n"
351 "whilelt p4.s, %[outpos], %[outwidth]\n"
352 "zip2 z13.s, z2.s, z6.s\n"
353 "incw %[outpos], all, mul #1\n"
354 "zip1 z14.s, z3.s, z7.s\n"
355 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
356 "zip2 z15.s, z3.s, z7.s\n"
357 "whilelt p5.s, %[outpos], %[outwidth]\n"
358 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
359 "incw %[outpos], all, mul #1\n"
360 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
361 "whilelt p6.s, %[outpos], %[outwidth]\n"
362 "incw %[outpos], all, mul #1\n"
363 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
364 "whilelt p7.s, %[outpos], %[outwidth]\n"
365 "incw %[outpos], all, mul #1\n"
366 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
367 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
368 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
369 "addvl %[outptr], %[outptr], #8\n"
370 "b 1b\n"
371 "2:\n"
372 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
373 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
374 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
375 );
376 break;
377
 // Six valid rows: zeroed z6 stands in for rows 6 and 7.
378 case 6:
379 __asm __volatile(
380 "1:\n"
381 "whilelt p0.s, %[inpos], %[inwidth]\n"
382 "b.none 2f\n"
383 "mov z6.s, #0\n"
384 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
385 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
386 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
387 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
388 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
389 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
390 "incw %[inpos], all, mul #1\n"
391 "zip1 z12.s, z2.s, z6.s\n"
392 "whilelt p0.s, %[outpos], %[outwidth]\n"
393 "zip1 z8.s, z0.s, z4.s\n"
394 "incw %[outpos], all, mul #1\n"
395 "zip2 z9.s, z0.s, z4.s\n"
396 "zip1 z10.s, z1.s, z5.s\n"
397 "zip2 z11.s, z1.s, z5.s\n"
398 "zip2 z13.s, z2.s, z6.s\n"
399 "whilelt p1.s, %[outpos], %[outwidth]\n"
400 "zip1 z14.s, z3.s, z6.s\n"
401 "incw %[outpos], all, mul #1\n"
402 "zip2 z15.s, z3.s, z6.s\n"
403 "zip1 z0.s, z8.s, z12.s\n"
404 "zip2 z1.s, z8.s, z12.s\n"
405 "zip1 z2.s, z9.s, z13.s\n"
406 "whilelt p2.s, %[outpos], %[outwidth]\n"
407 "zip2 z3.s, z9.s, z13.s\n"
408 "incw %[outpos], all, mul #1\n"
409 "zip1 z4.s, z10.s, z14.s\n"
410 "zip2 z5.s, z10.s, z14.s\n"
411 "zip1 z6.s, z11.s, z15.s\n"
412 "zip2 z7.s, z11.s, z15.s\n"
413 "whilelt p3.s, %[outpos], %[outwidth]\n"
414 "zip1 z8.s, z0.s, z4.s\n"
415 "incw %[outpos], all, mul #1\n"
416 "zip2 z9.s, z0.s, z4.s\n"
417 "zip1 z10.s, z1.s, z5.s\n"
418 "zip2 z11.s, z1.s, z5.s\n"
419 "st1w z8.s, p0, [%[outptr]]\n"
420 "zip1 z12.s, z2.s, z6.s\n"
421 "whilelt p4.s, %[outpos], %[outwidth]\n"
422 "zip2 z13.s, z2.s, z6.s\n"
423 "incw %[outpos], all, mul #1\n"
424 "zip1 z14.s, z3.s, z7.s\n"
425 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
426 "zip2 z15.s, z3.s, z7.s\n"
427 "whilelt p5.s, %[outpos], %[outwidth]\n"
428 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
429 "incw %[outpos], all, mul #1\n"
430 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
431 "whilelt p6.s, %[outpos], %[outwidth]\n"
432 "incw %[outpos], all, mul #1\n"
433 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
434 "whilelt p7.s, %[outpos], %[outwidth]\n"
435 "incw %[outpos], all, mul #1\n"
436 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
437 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
438 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
439 "addvl %[outptr], %[outptr], #8\n"
440 "b 1b\n"
441 "2:\n"
442 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
443 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
444 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
445 );
446 break;
447
 // Seven valid rows: zeroed z7 stands in for row 7.
448 case 7:
449 __asm __volatile(
450 "1:\n"
451 "whilelt p0.s, %[inpos], %[inwidth]\n"
452 "b.none 2f\n"
453 "mov z7.s, #0\n"
454 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
455 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
456 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
457 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
458 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
459 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
460 "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
461 "incw %[inpos], all, mul #1\n"
462 "zip1 z14.s, z3.s, z7.s\n"
463 "whilelt p0.s, %[outpos], %[outwidth]\n"
464 "zip1 z8.s, z0.s, z4.s\n"
465 "incw %[outpos], all, mul #1\n"
466 "zip2 z9.s, z0.s, z4.s\n"
467 "zip1 z10.s, z1.s, z5.s\n"
468 "zip2 z11.s, z1.s, z5.s\n"
469 "zip1 z12.s, z2.s, z6.s\n"
470 "whilelt p1.s, %[outpos], %[outwidth]\n"
471 "zip2 z13.s, z2.s, z6.s\n"
472 "incw %[outpos], all, mul #1\n"
473 "zip2 z15.s, z3.s, z7.s\n"
474 "zip1 z0.s, z8.s, z12.s\n"
475 "zip2 z1.s, z8.s, z12.s\n"
476 "zip1 z2.s, z9.s, z13.s\n"
477 "whilelt p2.s, %[outpos], %[outwidth]\n"
478 "zip2 z3.s, z9.s, z13.s\n"
479 "incw %[outpos], all, mul #1\n"
480 "zip1 z4.s, z10.s, z14.s\n"
481 "zip2 z5.s, z10.s, z14.s\n"
482 "zip1 z6.s, z11.s, z15.s\n"
483 "zip2 z7.s, z11.s, z15.s\n"
484 "whilelt p3.s, %[outpos], %[outwidth]\n"
485 "zip1 z8.s, z0.s, z4.s\n"
486 "incw %[outpos], all, mul #1\n"
487 "zip2 z9.s, z0.s, z4.s\n"
488 "zip1 z10.s, z1.s, z5.s\n"
489 "zip2 z11.s, z1.s, z5.s\n"
490 "st1w z8.s, p0, [%[outptr]]\n"
491 "zip1 z12.s, z2.s, z6.s\n"
492 "whilelt p4.s, %[outpos], %[outwidth]\n"
493 "zip2 z13.s, z2.s, z6.s\n"
494 "incw %[outpos], all, mul #1\n"
495 "zip1 z14.s, z3.s, z7.s\n"
496 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
497 "zip2 z15.s, z3.s, z7.s\n"
498 "whilelt p5.s, %[outpos], %[outwidth]\n"
499 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
500 "incw %[outpos], all, mul #1\n"
501 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
502 "whilelt p6.s, %[outpos], %[outwidth]\n"
503 "incw %[outpos], all, mul #1\n"
504 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
505 "whilelt p7.s, %[outpos], %[outwidth]\n"
506 "incw %[outpos], all, mul #1\n"
507 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
508 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
509 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
510 "addvl %[outptr], %[outptr], #8\n"
511 "b 1b\n"
512 "2:\n"
513 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
514 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
515 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
516 );
517 break;
518
 // Full group: all eight rows are loaded, so no zero register is needed.
 // `default` shares this path, covering every height of 8 or more.
519 default:
520 case 8:
521 __asm __volatile(
 // Loop head: p0 marks the columns still to be read; leave when none remain.
522 "1:\n"
523 "whilelt p0.s, %[inpos], %[inwidth]\n"
524 "b.none 2f\n"
 // Load one vector of columns from each row; inactive lanes are zeroed (p0/z).
525 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
526 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
527 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
528 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
529 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
530 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
531 "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
532 "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
533 "incw %[inpos], all, mul #1\n"
 // Stage one: zip row r with row r+4 (z0..z7 -> z8..z15). The whilelt/incw
 // pairs scheduled between the zips build store predicates p0..p7, one per
 // output vector, from outpos and outwidth.
534 "zip1 z8.s, z0.s, z4.s\n"
535 "whilelt p0.s, %[outpos], %[outwidth]\n"
536 "zip2 z9.s, z0.s, z4.s\n"
537 "incw %[outpos], all, mul #1\n"
538 "zip1 z10.s, z1.s, z5.s\n"
539 "zip2 z11.s, z1.s, z5.s\n"
540 "zip1 z12.s, z2.s, z6.s\n"
541 "zip2 z13.s, z2.s, z6.s\n"
542 "whilelt p1.s, %[outpos], %[outwidth]\n"
543 "zip1 z14.s, z3.s, z7.s\n"
544 "incw %[outpos], all, mul #1\n"
545 "zip2 z15.s, z3.s, z7.s\n"
 // Stage two: zip the stage-one results pairwise (z8..z15 -> z0..z7).
546 "zip1 z0.s, z8.s, z12.s\n"
547 "zip2 z1.s, z8.s, z12.s\n"
548 "zip1 z2.s, z9.s, z13.s\n"
549 "whilelt p2.s, %[outpos], %[outwidth]\n"
550 "zip2 z3.s, z9.s, z13.s\n"
551 "incw %[outpos], all, mul #1\n"
552 "zip1 z4.s, z10.s, z14.s\n"
553 "zip2 z5.s, z10.s, z14.s\n"
554 "zip1 z6.s, z11.s, z15.s\n"
555 "zip2 z7.s, z11.s, z15.s\n"
 // Stage three: the final zips leave z8..z15 holding, for successive columns,
 // the eight row values in row order; each vector is stored once it is ready.
556 "whilelt p3.s, %[outpos], %[outwidth]\n"
557 "zip1 z8.s, z0.s, z4.s\n"
558 "incw %[outpos], all, mul #1\n"
559 "zip2 z9.s, z0.s, z4.s\n"
560 "zip1 z10.s, z1.s, z5.s\n"
561 "zip2 z11.s, z1.s, z5.s\n"
562 "st1w z8.s, p0, [%[outptr]]\n"
563 "zip1 z12.s, z2.s, z6.s\n"
564 "whilelt p4.s, %[outpos], %[outwidth]\n"
565 "zip2 z13.s, z2.s, z6.s\n"
566 "incw %[outpos], all, mul #1\n"
567 "zip1 z14.s, z3.s, z7.s\n"
568 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
569 "zip2 z15.s, z3.s, z7.s\n"
570 "whilelt p5.s, %[outpos], %[outwidth]\n"
571 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
572 "incw %[outpos], all, mul #1\n"
573 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
574 "whilelt p6.s, %[outpos], %[outwidth]\n"
575 "incw %[outpos], all, mul #1\n"
576 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
577 "whilelt p7.s, %[outpos], %[outwidth]\n"
578 "incw %[outpos], all, mul #1\n"
579 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
580 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
581 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
 // Step the output pointer past the eight vectors just written and loop.
582 "addvl %[outptr], %[outptr], #8\n"
583 "b 1b\n"
584 "2:\n"
 // The row pointers are declared read-write ("+r") although the asm only uses
 // them as load base addresses.
585 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
586 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
587 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
588 );
589 break;
590
591
592 }
593 }
594}
595
596#endif // __ARM_FEATURE_SVE