/*
 * Copyright (c) 2018 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
28template<>
29template<typename T>
30inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
31{
32 uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
33 const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
34
35 for (int y=y0; y<ymax; y+=8)
36 {
37 const int height = ymax-y;
38 const long inwidth = (kmax - k0);
39 const long outwidth = (inwidth * 8 + 1) / 2;
40 long inpos = 0;
41 long outpos = 0;
42
43 uint16_t *outptr = master_outptr;
44 master_outptr += (outwidth * 2);
45
46 const uint16_t *inptr0 = inptr + y * ldin + k0;
47 const uint16_t *inptr1 = inptr0 + ldin;
48 const uint16_t *inptr2 = inptr1 + ldin;
49 const uint16_t *inptr3 = inptr2 + ldin;
50 const uint16_t *inptr4 = inptr3 + ldin;
51 const uint16_t *inptr5 = inptr4 + ldin;
52 const uint16_t *inptr6 = inptr5 + ldin;
53 const uint16_t *inptr7 = inptr6 + ldin;
54
55 switch(height)
56 {
57 case 1:
58 __asm __volatile(
59 "1:\n"
60 "whilelt p0.h, %[inpos], %[inwidth]\n"
61 "b.none 2f\n"
62 "mov z4.h, #0\n"
63 "ld1h z0.h, p0/z, [%[inptr0]]\n"
64 "zip1 z8.s, z0.s, z4.s\n"
65 "inch %[inpos], all, mul #1\n"
66 "zip2 z9.s, z0.s, z4.s\n"
67 "addvl %[inptr0], %[inptr0], #1\n"
68 "zip1 z0.s, z8.s, z4.s\n"
69 "whilelt p0.s, %[outpos], %[outwidth]\n"
70 "zip2 z1.s, z8.s, z4.s\n"
71 "incw %[outpos], all, mul #1\n"
72 "zip1 z2.s, z9.s, z4.s\n"
73 "whilelt p1.s, %[outpos], %[outwidth]\n"
74 "zip2 z3.s, z9.s, z4.s\n"
75 "incw %[outpos], all, mul #1\n"
76 "zip1 z8.s, z0.s, z4.s\n"
77 "st1w z8.s, p0, [%[outptr]]\n"
78 "zip2 z9.s, z0.s, z4.s\n"
79 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
80 "zip1 z10.s, z1.s, z4.s\n"
81 "whilelt p2.s, %[outpos], %[outwidth]\n"
82 "zip2 z11.s, z1.s, z4.s\n"
83 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
84 "zip1 z12.s, z2.s, z4.s\n"
85 "incw %[outpos], all, mul #1\n"
86 "zip2 z13.s, z2.s, z4.s\n"
87 "whilelt p3.s, %[outpos], %[outwidth]\n"
88 "zip1 z14.s, z3.s, z4.s\n"
89 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
90 "zip2 z15.s, z3.s, z4.s\n"
91 "incw %[outpos], all, mul #1\n"
92 "whilelt p0.s, %[outpos], %[outwidth]\n"
93 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
94 "incw %[outpos], all, mul #1\n"
95 "whilelt p1.s, %[outpos], %[outwidth]\n"
96 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
97 "incw %[outpos], all, mul #1\n"
98 "whilelt p2.s, %[outpos], %[outwidth]\n"
99 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
100 "incw %[outpos], all, mul #1\n"
101 "whilelt p3.s, %[outpos], %[outwidth]\n"
102 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
103 "incw %[outpos], all, mul #1\n"
104 "addvl %[outptr], %[outptr], #8\n"
105 "b 1b\n"
106 "2:\n"
107 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
108 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
109 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
110 );
111 break;
112
113 case 2:
114 __asm __volatile(
115 "1:\n"
116 "whilelt p0.h, %[inpos], %[inwidth]\n"
117 "b.none 2f\n"
118 "mov z4.h, #0\n"
119 "ld1h z0.h, p0/z, [%[inptr0]]\n"
120 "zip1 z8.s, z0.s, z4.s\n"
121 "ld1h z1.h, p0/z, [%[inptr1]]\n"
122 "zip2 z9.s, z0.s, z4.s\n"
123 "inch %[inpos], all, mul #1\n"
124 "zip1 z10.s, z1.s, z4.s\n"
125 "addvl %[inptr0], %[inptr0], #1\n"
126 "zip2 z11.s, z1.s, z4.s\n"
127 "addvl %[inptr1], %[inptr1], #1\n"
128 "zip1 z0.s, z8.s, z4.s\n"
129 "whilelt p0.s, %[outpos], %[outwidth]\n"
130 "zip2 z1.s, z8.s, z4.s\n"
131 "incw %[outpos], all, mul #1\n"
132 "zip1 z2.s, z9.s, z4.s\n"
133 "whilelt p1.s, %[outpos], %[outwidth]\n"
134 "zip2 z3.s, z9.s, z4.s\n"
135 "incw %[outpos], all, mul #1\n"
136 "mov z14.h, #0\n"
137 "whilelt p2.s, %[outpos], %[outwidth]\n"
138 "zip1 z4.s, z10.s, z14.s\n"
139 "incw %[outpos], all, mul #1\n"
140 "zip2 z5.s, z10.s, z14.s\n"
141 "whilelt p3.s, %[outpos], %[outwidth]\n"
142 "zip1 z6.s, z11.s, z14.s\n"
143 "incw %[outpos], all, mul #1\n"
144 "zip2 z7.s, z11.s, z14.s\n"
145 "zip1 z8.s, z0.s, z4.s\n"
146 "st1w z8.s, p0, [%[outptr]]\n"
147 "zip2 z9.s, z0.s, z4.s\n"
148 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
149 "zip1 z10.s, z1.s, z5.s\n"
150 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
151 "zip2 z11.s, z1.s, z5.s\n"
152 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
153 "zip1 z12.s, z2.s, z6.s\n"
154 "whilelt p0.s, %[outpos], %[outwidth]\n"
155 "zip2 z13.s, z2.s, z6.s\n"
156 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
157 "zip1 z14.s, z3.s, z7.s\n"
158 "incw %[outpos], all, mul #1\n"
159 "zip2 z15.s, z3.s, z7.s\n"
160 "whilelt p1.s, %[outpos], %[outwidth]\n"
161 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
162 "incw %[outpos], all, mul #1\n"
163 "whilelt p2.s, %[outpos], %[outwidth]\n"
164 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
165 "incw %[outpos], all, mul #1\n"
166 "whilelt p3.s, %[outpos], %[outwidth]\n"
167 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
168 "incw %[outpos], all, mul #1\n"
169 "addvl %[outptr], %[outptr], #8\n"
170 "b 1b\n"
171 "2:\n"
172 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
173 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
174 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
175 );
176 break;
177
178 case 3:
179 __asm __volatile(
180 "1:\n"
181 "whilelt p0.h, %[inpos], %[inwidth]\n"
182 "b.none 2f\n"
183 "mov z4.h, #0\n"
184 "ld1h z0.h, p0/z, [%[inptr0]]\n"
185 "zip1 z8.s, z0.s, z4.s\n"
186 "ld1h z1.h, p0/z, [%[inptr1]]\n"
187 "zip2 z9.s, z0.s, z4.s\n"
188 "ld1h z2.h, p0/z, [%[inptr2]]\n"
189 "zip1 z10.s, z1.s, z4.s\n"
190 "inch %[inpos], all, mul #1\n"
191 "zip2 z11.s, z1.s, z4.s\n"
192 "addvl %[inptr0], %[inptr0], #1\n"
193 "zip1 z12.s, z2.s, z4.s\n"
194 "addvl %[inptr1], %[inptr1], #1\n"
195 "zip2 z13.s, z2.s, z4.s\n"
196 "addvl %[inptr2], %[inptr2], #1\n"
197 "zip1 z0.s, z8.s, z12.s\n"
198 "whilelt p0.s, %[outpos], %[outwidth]\n"
199 "zip2 z1.s, z8.s, z12.s\n"
200 "incw %[outpos], all, mul #1\n"
201 "zip1 z2.s, z9.s, z13.s\n"
202 "whilelt p1.s, %[outpos], %[outwidth]\n"
203 "zip2 z3.s, z9.s, z13.s\n"
204 "incw %[outpos], all, mul #1\n"
205 "mov z14.h, #0\n"
206 "whilelt p2.s, %[outpos], %[outwidth]\n"
207 "zip1 z4.s, z10.s, z14.s\n"
208 "incw %[outpos], all, mul #1\n"
209 "zip2 z5.s, z10.s, z14.s\n"
210 "whilelt p3.s, %[outpos], %[outwidth]\n"
211 "zip1 z6.s, z11.s, z14.s\n"
212 "incw %[outpos], all, mul #1\n"
213 "zip2 z7.s, z11.s, z14.s\n"
214 "zip1 z8.s, z0.s, z4.s\n"
215 "st1w z8.s, p0, [%[outptr]]\n"
216 "zip2 z9.s, z0.s, z4.s\n"
217 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
218 "zip1 z10.s, z1.s, z5.s\n"
219 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
220 "zip2 z11.s, z1.s, z5.s\n"
221 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
222 "zip1 z12.s, z2.s, z6.s\n"
223 "whilelt p0.s, %[outpos], %[outwidth]\n"
224 "zip2 z13.s, z2.s, z6.s\n"
225 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
226 "zip1 z14.s, z3.s, z7.s\n"
227 "incw %[outpos], all, mul #1\n"
228 "zip2 z15.s, z3.s, z7.s\n"
229 "whilelt p1.s, %[outpos], %[outwidth]\n"
230 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
231 "incw %[outpos], all, mul #1\n"
232 "whilelt p2.s, %[outpos], %[outwidth]\n"
233 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
234 "incw %[outpos], all, mul #1\n"
235 "whilelt p3.s, %[outpos], %[outwidth]\n"
236 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
237 "incw %[outpos], all, mul #1\n"
238 "addvl %[outptr], %[outptr], #8\n"
239 "b 1b\n"
240 "2:\n"
241 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
242 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
243 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
244 );
245 break;
246
247 case 4:
248 __asm __volatile(
249 "1:\n"
250 "whilelt p0.h, %[inpos], %[inwidth]\n"
251 "b.none 2f\n"
252 "mov z4.h, #0\n"
253 "ld1h z0.h, p0/z, [%[inptr0]]\n"
254 "zip1 z8.s, z0.s, z4.s\n"
255 "ld1h z1.h, p0/z, [%[inptr1]]\n"
256 "zip2 z9.s, z0.s, z4.s\n"
257 "ld1h z2.h, p0/z, [%[inptr2]]\n"
258 "zip1 z10.s, z1.s, z4.s\n"
259 "ld1h z3.h, p0/z, [%[inptr3]]\n"
260 "zip2 z11.s, z1.s, z4.s\n"
261 "inch %[inpos], all, mul #1\n"
262 "zip1 z12.s, z2.s, z4.s\n"
263 "addvl %[inptr0], %[inptr0], #1\n"
264 "zip2 z13.s, z2.s, z4.s\n"
265 "addvl %[inptr1], %[inptr1], #1\n"
266 "zip1 z14.s, z3.s, z4.s\n"
267 "addvl %[inptr2], %[inptr2], #1\n"
268 "zip2 z15.s, z3.s, z4.s\n"
269 "addvl %[inptr3], %[inptr3], #1\n"
270 "zip1 z0.s, z8.s, z12.s\n"
271 "whilelt p0.s, %[outpos], %[outwidth]\n"
272 "zip2 z1.s, z8.s, z12.s\n"
273 "incw %[outpos], all, mul #1\n"
274 "zip1 z2.s, z9.s, z13.s\n"
275 "whilelt p1.s, %[outpos], %[outwidth]\n"
276 "zip2 z3.s, z9.s, z13.s\n"
277 "incw %[outpos], all, mul #1\n"
278 "zip1 z4.s, z10.s, z14.s\n"
279 "whilelt p2.s, %[outpos], %[outwidth]\n"
280 "zip2 z5.s, z10.s, z14.s\n"
281 "incw %[outpos], all, mul #1\n"
282 "zip1 z6.s, z11.s, z15.s\n"
283 "whilelt p3.s, %[outpos], %[outwidth]\n"
284 "zip2 z7.s, z11.s, z15.s\n"
285 "incw %[outpos], all, mul #1\n"
286 "zip1 z8.s, z0.s, z4.s\n"
287 "st1w z8.s, p0, [%[outptr]]\n"
288 "zip2 z9.s, z0.s, z4.s\n"
289 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
290 "zip1 z10.s, z1.s, z5.s\n"
291 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
292 "zip2 z11.s, z1.s, z5.s\n"
293 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
294 "zip1 z12.s, z2.s, z6.s\n"
295 "whilelt p0.s, %[outpos], %[outwidth]\n"
296 "zip2 z13.s, z2.s, z6.s\n"
297 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
298 "zip1 z14.s, z3.s, z7.s\n"
299 "incw %[outpos], all, mul #1\n"
300 "zip2 z15.s, z3.s, z7.s\n"
301 "whilelt p1.s, %[outpos], %[outwidth]\n"
302 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
303 "incw %[outpos], all, mul #1\n"
304 "whilelt p2.s, %[outpos], %[outwidth]\n"
305 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
306 "incw %[outpos], all, mul #1\n"
307 "whilelt p3.s, %[outpos], %[outwidth]\n"
308 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
309 "incw %[outpos], all, mul #1\n"
310 "addvl %[outptr], %[outptr], #8\n"
311 "b 1b\n"
312 "2:\n"
313 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
314 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
315 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
316 );
317 break;
318
319 case 5:
320 __asm __volatile(
321 "1:\n"
322 "whilelt p0.h, %[inpos], %[inwidth]\n"
323 "b.none 2f\n"
324 "mov z5.h, #0\n"
325 "ld1h z0.h, p0/z, [%[inptr0]]\n"
326 "ld1h z1.h, p0/z, [%[inptr1]]\n"
327 "inch %[inpos], all, mul #1\n"
328 "zip1 z10.s, z1.s, z5.s\n"
329 "ld1h z2.h, p0/z, [%[inptr2]]\n"
330 "zip2 z11.s, z1.s, z5.s\n"
331 "ld1h z3.h, p0/z, [%[inptr3]]\n"
332 "zip1 z12.s, z2.s, z5.s\n"
333 "ld1h z4.h, p0/z, [%[inptr4]]\n"
334 "zip1 z8.s, z0.s, z4.s\n"
335 "addvl %[inptr0], %[inptr0], #1\n"
336 "zip2 z9.s, z0.s, z4.s\n"
337 "addvl %[inptr1], %[inptr1], #1\n"
338 "zip2 z13.s, z2.s, z5.s\n"
339 "addvl %[inptr2], %[inptr2], #1\n"
340 "zip1 z14.s, z3.s, z5.s\n"
341 "addvl %[inptr3], %[inptr3], #1\n"
342 "zip2 z15.s, z3.s, z5.s\n"
343 "addvl %[inptr4], %[inptr4], #1\n"
344 "zip1 z0.s, z8.s, z12.s\n"
345 "whilelt p0.s, %[outpos], %[outwidth]\n"
346 "zip2 z1.s, z8.s, z12.s\n"
347 "incw %[outpos], all, mul #1\n"
348 "zip1 z2.s, z9.s, z13.s\n"
349 "whilelt p1.s, %[outpos], %[outwidth]\n"
350 "zip2 z3.s, z9.s, z13.s\n"
351 "incw %[outpos], all, mul #1\n"
352 "zip1 z4.s, z10.s, z14.s\n"
353 "whilelt p2.s, %[outpos], %[outwidth]\n"
354 "zip2 z5.s, z10.s, z14.s\n"
355 "incw %[outpos], all, mul #1\n"
356 "zip1 z6.s, z11.s, z15.s\n"
357 "whilelt p3.s, %[outpos], %[outwidth]\n"
358 "zip2 z7.s, z11.s, z15.s\n"
359 "incw %[outpos], all, mul #1\n"
360 "zip1 z8.s, z0.s, z4.s\n"
361 "st1w z8.s, p0, [%[outptr]]\n"
362 "zip2 z9.s, z0.s, z4.s\n"
363 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
364 "zip1 z10.s, z1.s, z5.s\n"
365 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
366 "zip2 z11.s, z1.s, z5.s\n"
367 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
368 "zip1 z12.s, z2.s, z6.s\n"
369 "whilelt p0.s, %[outpos], %[outwidth]\n"
370 "zip2 z13.s, z2.s, z6.s\n"
371 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
372 "zip1 z14.s, z3.s, z7.s\n"
373 "incw %[outpos], all, mul #1\n"
374 "zip2 z15.s, z3.s, z7.s\n"
375 "whilelt p1.s, %[outpos], %[outwidth]\n"
376 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
377 "incw %[outpos], all, mul #1\n"
378 "whilelt p2.s, %[outpos], %[outwidth]\n"
379 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
380 "incw %[outpos], all, mul #1\n"
381 "whilelt p3.s, %[outpos], %[outwidth]\n"
382 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
383 "incw %[outpos], all, mul #1\n"
384 "addvl %[outptr], %[outptr], #8\n"
385 "b 1b\n"
386 "2:\n"
387 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
388 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
389 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
390 );
391 break;
392
393 case 6:
394 __asm __volatile(
395 "1:\n"
396 "whilelt p0.h, %[inpos], %[inwidth]\n"
397 "b.none 2f\n"
398 "mov z6.h, #0\n"
399 "ld1h z0.h, p0/z, [%[inptr0]]\n"
400 "ld1h z1.h, p0/z, [%[inptr1]]\n"
401 "inch %[inpos], all, mul #1\n"
402 "ld1h z2.h, p0/z, [%[inptr2]]\n"
403 "addvl %[inptr0], %[inptr0], #1\n"
404 "zip1 z12.s, z2.s, z6.s\n"
405 "ld1h z3.h, p0/z, [%[inptr3]]\n"
406 "zip2 z13.s, z2.s, z6.s\n"
407 "ld1h z4.h, p0/z, [%[inptr4]]\n"
408 "zip1 z8.s, z0.s, z4.s\n"
409 "ld1h z5.h, p0/z, [%[inptr5]]\n"
410 "zip2 z9.s, z0.s, z4.s\n"
411 "addvl %[inptr1], %[inptr1], #1\n"
412 "zip1 z10.s, z1.s, z5.s\n"
413 "addvl %[inptr2], %[inptr2], #1\n"
414 "zip2 z11.s, z1.s, z5.s\n"
415 "addvl %[inptr3], %[inptr3], #1\n"
416 "zip1 z14.s, z3.s, z6.s\n"
417 "addvl %[inptr4], %[inptr4], #1\n"
418 "zip2 z15.s, z3.s, z6.s\n"
419 "addvl %[inptr5], %[inptr5], #1\n"
420 "zip1 z0.s, z8.s, z12.s\n"
421 "whilelt p0.s, %[outpos], %[outwidth]\n"
422 "zip2 z1.s, z8.s, z12.s\n"
423 "incw %[outpos], all, mul #1\n"
424 "zip1 z2.s, z9.s, z13.s\n"
425 "whilelt p1.s, %[outpos], %[outwidth]\n"
426 "zip2 z3.s, z9.s, z13.s\n"
427 "incw %[outpos], all, mul #1\n"
428 "zip1 z4.s, z10.s, z14.s\n"
429 "whilelt p2.s, %[outpos], %[outwidth]\n"
430 "zip2 z5.s, z10.s, z14.s\n"
431 "incw %[outpos], all, mul #1\n"
432 "zip1 z6.s, z11.s, z15.s\n"
433 "whilelt p3.s, %[outpos], %[outwidth]\n"
434 "zip2 z7.s, z11.s, z15.s\n"
435 "incw %[outpos], all, mul #1\n"
436 "zip1 z8.s, z0.s, z4.s\n"
437 "st1w z8.s, p0, [%[outptr]]\n"
438 "zip2 z9.s, z0.s, z4.s\n"
439 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
440 "zip1 z10.s, z1.s, z5.s\n"
441 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
442 "zip2 z11.s, z1.s, z5.s\n"
443 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
444 "zip1 z12.s, z2.s, z6.s\n"
445 "whilelt p0.s, %[outpos], %[outwidth]\n"
446 "zip2 z13.s, z2.s, z6.s\n"
447 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
448 "zip1 z14.s, z3.s, z7.s\n"
449 "incw %[outpos], all, mul #1\n"
450 "zip2 z15.s, z3.s, z7.s\n"
451 "whilelt p1.s, %[outpos], %[outwidth]\n"
452 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
453 "incw %[outpos], all, mul #1\n"
454 "whilelt p2.s, %[outpos], %[outwidth]\n"
455 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
456 "incw %[outpos], all, mul #1\n"
457 "whilelt p3.s, %[outpos], %[outwidth]\n"
458 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
459 "incw %[outpos], all, mul #1\n"
460 "addvl %[outptr], %[outptr], #8\n"
461 "b 1b\n"
462 "2:\n"
463 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
464 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
465 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
466 );
467 break;
468
469 case 7:
470 __asm __volatile(
471 "1:\n"
472 "whilelt p0.h, %[inpos], %[inwidth]\n"
473 "b.none 2f\n"
474 "mov z7.h, #0\n"
475 "ld1h z0.h, p0/z, [%[inptr0]]\n"
476 "ld1h z1.h, p0/z, [%[inptr1]]\n"
477 "inch %[inpos], all, mul #1\n"
478 "ld1h z2.h, p0/z, [%[inptr2]]\n"
479 "addvl %[inptr0], %[inptr0], #1\n"
480 "ld1h z3.h, p0/z, [%[inptr3]]\n"
481 "addvl %[inptr1], %[inptr1], #1\n"
482 "zip1 z14.s, z3.s, z7.s\n"
483 "ld1h z4.h, p0/z, [%[inptr4]]\n"
484 "zip1 z8.s, z0.s, z4.s\n"
485 "ld1h z5.h, p0/z, [%[inptr5]]\n"
486 "zip2 z9.s, z0.s, z4.s\n"
487 "ld1h z6.h, p0/z, [%[inptr6]]\n"
488 "zip1 z10.s, z1.s, z5.s\n"
489 "addvl %[inptr2], %[inptr2], #1\n"
490 "zip2 z11.s, z1.s, z5.s\n"
491 "addvl %[inptr3], %[inptr3], #1\n"
492 "zip1 z12.s, z2.s, z6.s\n"
493 "addvl %[inptr4], %[inptr4], #1\n"
494 "zip2 z13.s, z2.s, z6.s\n"
495 "addvl %[inptr5], %[inptr5], #1\n"
496 "zip2 z15.s, z3.s, z7.s\n"
497 "addvl %[inptr6], %[inptr6], #1\n"
498 "zip1 z0.s, z8.s, z12.s\n"
499 "whilelt p0.s, %[outpos], %[outwidth]\n"
500 "zip2 z1.s, z8.s, z12.s\n"
501 "incw %[outpos], all, mul #1\n"
502 "zip1 z2.s, z9.s, z13.s\n"
503 "whilelt p1.s, %[outpos], %[outwidth]\n"
504 "zip2 z3.s, z9.s, z13.s\n"
505 "incw %[outpos], all, mul #1\n"
506 "zip1 z4.s, z10.s, z14.s\n"
507 "whilelt p2.s, %[outpos], %[outwidth]\n"
508 "zip2 z5.s, z10.s, z14.s\n"
509 "incw %[outpos], all, mul #1\n"
510 "zip1 z6.s, z11.s, z15.s\n"
511 "whilelt p3.s, %[outpos], %[outwidth]\n"
512 "zip2 z7.s, z11.s, z15.s\n"
513 "incw %[outpos], all, mul #1\n"
514 "zip1 z8.s, z0.s, z4.s\n"
515 "st1w z8.s, p0, [%[outptr]]\n"
516 "zip2 z9.s, z0.s, z4.s\n"
517 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
518 "zip1 z10.s, z1.s, z5.s\n"
519 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
520 "zip2 z11.s, z1.s, z5.s\n"
521 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
522 "zip1 z12.s, z2.s, z6.s\n"
523 "whilelt p0.s, %[outpos], %[outwidth]\n"
524 "zip2 z13.s, z2.s, z6.s\n"
525 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
526 "zip1 z14.s, z3.s, z7.s\n"
527 "incw %[outpos], all, mul #1\n"
528 "zip2 z15.s, z3.s, z7.s\n"
529 "whilelt p1.s, %[outpos], %[outwidth]\n"
530 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
531 "incw %[outpos], all, mul #1\n"
532 "whilelt p2.s, %[outpos], %[outwidth]\n"
533 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
534 "incw %[outpos], all, mul #1\n"
535 "whilelt p3.s, %[outpos], %[outwidth]\n"
536 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
537 "incw %[outpos], all, mul #1\n"
538 "addvl %[outptr], %[outptr], #8\n"
539 "b 1b\n"
540 "2:\n"
541 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
542 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
543 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
544 );
545 break;
546
547 default:
548 case 8:
549 __asm __volatile(
550 "1:\n"
551 "whilelt p0.h, %[inpos], %[inwidth]\n"
552 "b.none 2f\n"
553 "ld1h z0.h, p0/z, [%[inptr0]]\n"
554 "inch %[inpos], all, mul #1\n"
555 "ld1h z1.h, p0/z, [%[inptr1]]\n"
556 "addvl %[inptr0], %[inptr0], #1\n"
557 "ld1h z2.h, p0/z, [%[inptr2]]\n"
558 "addvl %[inptr1], %[inptr1], #1\n"
559 "ld1h z3.h, p0/z, [%[inptr3]]\n"
560 "addvl %[inptr2], %[inptr2], #1\n"
561 "ld1h z4.h, p0/z, [%[inptr4]]\n"
562 "addvl %[inptr3], %[inptr3], #1\n"
563 "zip1 z8.s, z0.s, z4.s\n"
564 "ld1h z5.h, p0/z, [%[inptr5]]\n"
565 "zip2 z9.s, z0.s, z4.s\n"
566 "ld1h z6.h, p0/z, [%[inptr6]]\n"
567 "zip1 z10.s, z1.s, z5.s\n"
568 "ld1h z7.h, p0/z, [%[inptr7]]\n"
569 "zip2 z11.s, z1.s, z5.s\n"
570 "addvl %[inptr4], %[inptr4], #1\n"
571 "zip1 z12.s, z2.s, z6.s\n"
572 "addvl %[inptr5], %[inptr5], #1\n"
573 "zip2 z13.s, z2.s, z6.s\n"
574 "addvl %[inptr6], %[inptr6], #1\n"
575 "zip1 z14.s, z3.s, z7.s\n"
576 "addvl %[inptr7], %[inptr7], #1\n"
577 "zip2 z15.s, z3.s, z7.s\n"
578 "whilelt p0.s, %[outpos], %[outwidth]\n"
579 "zip1 z0.s, z8.s, z12.s\n"
580 "incw %[outpos], all, mul #1\n"
581 "zip2 z1.s, z8.s, z12.s\n"
582 "whilelt p1.s, %[outpos], %[outwidth]\n"
583 "zip1 z2.s, z9.s, z13.s\n"
584 "incw %[outpos], all, mul #1\n"
585 "zip2 z3.s, z9.s, z13.s\n"
586 "whilelt p2.s, %[outpos], %[outwidth]\n"
587 "zip1 z4.s, z10.s, z14.s\n"
588 "incw %[outpos], all, mul #1\n"
589 "zip2 z5.s, z10.s, z14.s\n"
590 "whilelt p3.s, %[outpos], %[outwidth]\n"
591 "zip1 z6.s, z11.s, z15.s\n"
592 "incw %[outpos], all, mul #1\n"
593 "zip2 z7.s, z11.s, z15.s\n"
594 "zip1 z8.s, z0.s, z4.s\n"
595 "st1w z8.s, p0, [%[outptr]]\n"
596 "zip2 z9.s, z0.s, z4.s\n"
597 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
598 "zip1 z10.s, z1.s, z5.s\n"
599 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
600 "zip2 z11.s, z1.s, z5.s\n"
601 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
602 "zip1 z12.s, z2.s, z6.s\n"
603 "whilelt p0.s, %[outpos], %[outwidth]\n"
604 "zip2 z13.s, z2.s, z6.s\n"
605 "st1w z12.s, p0, [%[outptr], #4, MUL VL]\n"
606 "zip1 z14.s, z3.s, z7.s\n"
607 "incw %[outpos], all, mul #1\n"
608 "zip2 z15.s, z3.s, z7.s\n"
609 "whilelt p1.s, %[outpos], %[outwidth]\n"
610 "st1w z13.s, p1, [%[outptr], #5, MUL VL]\n"
611 "incw %[outpos], all, mul #1\n"
612 "whilelt p2.s, %[outpos], %[outwidth]\n"
613 "st1w z14.s, p2, [%[outptr], #6, MUL VL]\n"
614 "incw %[outpos], all, mul #1\n"
615 "whilelt p3.s, %[outpos], %[outwidth]\n"
616 "st1w z15.s, p3, [%[outptr], #7, MUL VL]\n"
617 "incw %[outpos], all, mul #1\n"
618 "addvl %[outptr], %[outptr], #8\n"
619 "b 1b\n"
620 "2:\n"
621 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
622 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
623 : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
624 );
625 break;
626
627
628 }
629 }
630}
631
632#endif // __ARM_FEATURE_SVE