blob: 234433a0f1ef5983535e83e10f8281bed456f4c3 [file] [log] [blame]
Georgios Pinitas94672fb2020-01-22 18:36:27 +00001/*
2 * Copyright (c) 2019 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
// SVE interleave transform: 8 rows at a time, handled in 32-bit units (pairs of
// 16-bit elements).
//
// `in` and `out` are reinterpreted as arrays of uint16_t (assumes T is a 16-bit
// type - nothing in this function checks that; confirm against callers).
//
// Parameters, as used by the code below:
//   out      - destination buffer; each group of rows appends `outwidth`
//              16-bit elements to it.
//   in       - source matrix; row r starts at in + r * ldin.
//   ldin     - distance, in elements, between consecutive source rows.
//   y0, ymax - half-open range [y0, ymax) of source rows to process.
//   k0, kmax - half-open range [k0, kmax) of source columns to process.
//
// Output layout for one group of 8 rows: for every 32-bit column unit (two
// adjacent 16-bit elements) the units of rows 0..7 are written consecutively,
// row 0 first.  That ordering is the result of the three zip1/zip2 rounds in
// each asm block.  Rows beyond ymax in the final group are supplied as zeros,
// and an odd column count is padded with a zero element, because the loads use
// a zeroing predicate and outwidth is rounded up to whole pairs.
28template<>
29template<typename T>
30inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
31{
 // Work on raw 16-bit lanes regardless of T.
32 uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
33 const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
34
 // Each iteration handles one group of up to 8 consecutive rows.
35 for (int y=y0; y<ymax; y+=8)
36 {
 // Rows remaining from y onwards; this exceeds 8 for every group but the last.
37 const int height = ymax-y;
 // Number of 16-bit input elements per row.
38 const long inwidth = (kmax - k0);
 // Number of 16-bit output elements for this group: columns rounded up to
 // whole pairs, times 8 rows, times 2 elements per pair.
39 const long outwidth = ((inwidth + 1) / 2) * 16;
 // Element counters advanced inside the asm loops.
40 long inpos = 0;
41 long outpos = 0;
42
 // This group's output window; master_outptr moves on to the next group.
43 uint16_t *outptr = master_outptr;
44 master_outptr += outwidth;
45
 // Row pointers for the group.  Pointers for rows at or beyond ymax are still
 // computed here but are only used as asm operands in the cases that have
 // that many rows.
46 const uint16_t *inptr0 = inptr + y * ldin + k0;
47 const uint16_t *inptr1 = inptr0 + ldin;
48 const uint16_t *inptr2 = inptr1 + ldin;
49 const uint16_t *inptr3 = inptr2 + ldin;
50 const uint16_t *inptr4 = inptr3 + ldin;
51 const uint16_t *inptr5 = inptr4 + ldin;
52 const uint16_t *inptr6 = inptr5 + ldin;
53 const uint16_t *inptr7 = inptr6 + ldin;
54
 // One hand-scheduled asm loop per possible row count.  Every variant has the
 // same shape:
 //   1. whilelt p0 / b.none : build the input predicate, leave when no input
 //                            elements remain.
 //   2. ld1h ... p0/z       : load one vector per available row, inactive
 //                            lanes zeroed.
 //   3. three rounds of zip1/zip2 on .s lanes: interleave the 8 (real or zero)
 //                            rows into z8..z15.
 //   4. whilelt p0..p7 on outpos/outwidth, with inch between them: one store
 //                            predicate per output vector.
 //   5. eight predicated st1h stores, then addvl to move outptr on by eight
 //                            vector lengths.
 // Instructions from these steps are interleaved with one another in the
 // listings below.
55 switch(height)
56 {
 // One row: z4 is the zero vector standing in for the seven missing rows in
 // every zip round.
57 case 1:
58 __asm __volatile(
59 "1:\n"
 // Predicate over the remaining input elements; exit when none are left.
60 "whilelt p0.h, %[inpos], %[inwidth]\n"
61 "b.none 2f\n"
62 "mov z4.h, #0\n"
63 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
 // Advance by the number of 16-bit lanes in one vector.
64 "inch %[inpos], all, mul #1\n"
 // From here on p0..p7 are output predicates, one per stored vector.
65 "whilelt p0.h, %[outpos], %[outwidth]\n"
66 "inch %[outpos], all, mul #1\n"
 // Zip round 1: row 0 paired with zero.
67 "zip1 z8.s, z0.s, z4.s\n"
68 "zip2 z9.s, z0.s, z4.s\n"
69 "whilelt p1.h, %[outpos], %[outwidth]\n"
 // Zip round 2.
70 "zip1 z0.s, z8.s, z4.s\n"
71 "inch %[outpos], all, mul #1\n"
72 "zip2 z1.s, z8.s, z4.s\n"
73 "zip1 z2.s, z9.s, z4.s\n"
74 "zip2 z3.s, z9.s, z4.s\n"
75 "whilelt p2.h, %[outpos], %[outwidth]\n"
 // Zip round 3: results land in z8..z15, which are stored in that order.
76 "zip1 z8.s, z0.s, z4.s\n"
77 "inch %[outpos], all, mul #1\n"
78 "zip2 z9.s, z0.s, z4.s\n"
79 "zip1 z10.s, z1.s, z4.s\n"
80 "st1h z8.h, p0, [%[outptr]]\n"
81 "zip2 z11.s, z1.s, z4.s\n"
82 "whilelt p3.h, %[outpos], %[outwidth]\n"
83 "zip1 z12.s, z2.s, z4.s\n"
84 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
85 "zip2 z13.s, z2.s, z4.s\n"
86 "inch %[outpos], all, mul #1\n"
87 "zip1 z14.s, z3.s, z4.s\n"
88 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
89 "zip2 z15.s, z3.s, z4.s\n"
90 "whilelt p4.h, %[outpos], %[outwidth]\n"
91 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
92 "inch %[outpos], all, mul #1\n"
93 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
94 "whilelt p5.h, %[outpos], %[outwidth]\n"
95 "inch %[outpos], all, mul #1\n"
96 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
97 "whilelt p6.h, %[outpos], %[outwidth]\n"
98 "inch %[outpos], all, mul #1\n"
99 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
100 "whilelt p7.h, %[outpos], %[outwidth]\n"
101 "inch %[outpos], all, mul #1\n"
102 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
 // Step the output pointer past the eight vectors just written.
103 "addvl %[outptr], %[outptr], #8\n"
104 "b 1b\n"
105 "2:\n"
106 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
107 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
108 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
109 );
110 break;
111
 // Two rows.  Two zero vectors are needed: z4 serves as zero in round 1 and
 // the first half of round 2, but then becomes a round-2 destination, so z14
 // supplies zero for the rest of round 2.  z14 is itself overwritten in
 // round 3, hence both are re-zeroed at the top of every iteration.
112 case 2:
113 __asm __volatile(
114 "1:\n"
115 "whilelt p0.h, %[inpos], %[inwidth]\n"
116 "b.none 2f\n"
117 "mov z4.h, #0\n"
118 "mov z14.h, #0\n"
119 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
120 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
121 "inch %[inpos], all, mul #1\n"
122 "whilelt p0.h, %[outpos], %[outwidth]\n"
 // Zip round 1: rows 0 and 1 each paired with zero.
123 "zip1 z8.s, z0.s, z4.s\n"
124 "inch %[outpos], all, mul #1\n"
125 "zip2 z9.s, z0.s, z4.s\n"
126 "zip1 z10.s, z1.s, z4.s\n"
127 "zip2 z11.s, z1.s, z4.s\n"
128 "whilelt p1.h, %[outpos], %[outwidth]\n"
 // Zip round 2: z4 is still zero for the first four zips, z14 for the rest.
129 "zip1 z0.s, z8.s, z4.s\n"
130 "inch %[outpos], all, mul #1\n"
131 "zip2 z1.s, z8.s, z4.s\n"
132 "zip1 z2.s, z9.s, z4.s\n"
133 "zip2 z3.s, z9.s, z4.s\n"
134 "whilelt p2.h, %[outpos], %[outwidth]\n"
135 "zip1 z4.s, z10.s, z14.s\n"
136 "inch %[outpos], all, mul #1\n"
137 "zip2 z5.s, z10.s, z14.s\n"
138 "zip1 z6.s, z11.s, z14.s\n"
139 "zip2 z7.s, z11.s, z14.s\n"
140 "whilelt p3.h, %[outpos], %[outwidth]\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
141 "zip1 z8.s, z0.s, z4.s\n"
142 "inch %[outpos], all, mul #1\n"
143 "zip2 z9.s, z0.s, z4.s\n"
144 "zip1 z10.s, z1.s, z5.s\n"
145 "st1h z8.h, p0, [%[outptr]]\n"
146 "zip2 z11.s, z1.s, z5.s\n"
147 "whilelt p4.h, %[outpos], %[outwidth]\n"
148 "zip1 z12.s, z2.s, z6.s\n"
149 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
150 "zip2 z13.s, z2.s, z6.s\n"
151 "inch %[outpos], all, mul #1\n"
152 "zip1 z14.s, z3.s, z7.s\n"
153 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
154 "zip2 z15.s, z3.s, z7.s\n"
155 "whilelt p5.h, %[outpos], %[outwidth]\n"
156 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
157 "inch %[outpos], all, mul #1\n"
158 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
159 "whilelt p6.h, %[outpos], %[outwidth]\n"
160 "inch %[outpos], all, mul #1\n"
161 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
162 "whilelt p7.h, %[outpos], %[outwidth]\n"
163 "inch %[outpos], all, mul #1\n"
164 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
165 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
166 "addvl %[outptr], %[outptr], #8\n"
167 "b 1b\n"
168 "2:\n"
169 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
170 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
171 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
172 );
173 break;
174
 // Three rows.  z4 is zero throughout round 1; z14 is the zero partner used in
 // round 2 in place of the missing (row 3, row 7) combination.
175 case 3:
176 __asm __volatile(
177 "1:\n"
178 "whilelt p0.h, %[inpos], %[inwidth]\n"
179 "b.none 2f\n"
180 "mov z4.h, #0\n"
181 "mov z14.h, #0\n"
182 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
183 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
184 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
185 "inch %[inpos], all, mul #1\n"
 // Zip round 1: rows 0, 1 and 2 each paired with zero.
186 "zip1 z8.s, z0.s, z4.s\n"
187 "whilelt p0.h, %[outpos], %[outwidth]\n"
188 "zip2 z9.s, z0.s, z4.s\n"
189 "inch %[outpos], all, mul #1\n"
190 "zip1 z10.s, z1.s, z4.s\n"
191 "zip2 z11.s, z1.s, z4.s\n"
192 "zip1 z12.s, z2.s, z4.s\n"
193 "whilelt p1.h, %[outpos], %[outwidth]\n"
194 "zip2 z13.s, z2.s, z4.s\n"
195 "inch %[outpos], all, mul #1\n"
 // Zip round 2: row-0 data with row-2 data, row-1 data with zero (z14).
196 "zip1 z0.s, z8.s, z12.s\n"
197 "zip2 z1.s, z8.s, z12.s\n"
198 "zip1 z2.s, z9.s, z13.s\n"
199 "whilelt p2.h, %[outpos], %[outwidth]\n"
200 "zip2 z3.s, z9.s, z13.s\n"
201 "inch %[outpos], all, mul #1\n"
202 "zip1 z4.s, z10.s, z14.s\n"
203 "zip2 z5.s, z10.s, z14.s\n"
204 "zip1 z6.s, z11.s, z14.s\n"
205 "whilelt p3.h, %[outpos], %[outwidth]\n"
206 "zip2 z7.s, z11.s, z14.s\n"
207 "inch %[outpos], all, mul #1\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
208 "zip1 z8.s, z0.s, z4.s\n"
209 "zip2 z9.s, z0.s, z4.s\n"
210 "zip1 z10.s, z1.s, z5.s\n"
211 "whilelt p4.h, %[outpos], %[outwidth]\n"
212 "zip2 z11.s, z1.s, z5.s\n"
213 "st1h z8.h, p0, [%[outptr]]\n"
214 "zip1 z12.s, z2.s, z6.s\n"
215 "inch %[outpos], all, mul #1\n"
216 "zip2 z13.s, z2.s, z6.s\n"
217 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
218 "zip1 z14.s, z3.s, z7.s\n"
219 "zip2 z15.s, z3.s, z7.s\n"
220 "whilelt p5.h, %[outpos], %[outwidth]\n"
221 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
222 "inch %[outpos], all, mul #1\n"
223 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
224 "whilelt p6.h, %[outpos], %[outwidth]\n"
225 "inch %[outpos], all, mul #1\n"
226 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
227 "whilelt p7.h, %[outpos], %[outwidth]\n"
228 "inch %[outpos], all, mul #1\n"
229 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
230 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
231 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
232 "addvl %[outptr], %[outptr], #8\n"
233 "b 1b\n"
234 "2:\n"
235 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
236 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
237 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
238 );
239 break;
240
 // Four rows.  z4 is the zero partner for all four rows in round 1; it is only
 // reused as a destination in round 2, after its last use as zero.
241 case 4:
242 __asm __volatile(
243 "1:\n"
244 "whilelt p0.h, %[inpos], %[inwidth]\n"
245 "b.none 2f\n"
246 "mov z4.h, #0\n"
247 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
248 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
249 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
250 "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
251 "inch %[inpos], all, mul #1\n"
 // Zip round 1: rows 0..3 each paired with zero.
252 "zip1 z8.s, z0.s, z4.s\n"
253 "whilelt p0.h, %[outpos], %[outwidth]\n"
254 "zip2 z9.s, z0.s, z4.s\n"
255 "inch %[outpos], all, mul #1\n"
256 "zip1 z10.s, z1.s, z4.s\n"
257 "zip2 z11.s, z1.s, z4.s\n"
258 "zip1 z12.s, z2.s, z4.s\n"
259 "whilelt p1.h, %[outpos], %[outwidth]\n"
260 "zip2 z13.s, z2.s, z4.s\n"
261 "inch %[outpos], all, mul #1\n"
262 "zip1 z14.s, z3.s, z4.s\n"
263 "zip2 z15.s, z3.s, z4.s\n"
 // Zip round 2.
264 "zip1 z0.s, z8.s, z12.s\n"
265 "whilelt p2.h, %[outpos], %[outwidth]\n"
266 "zip2 z1.s, z8.s, z12.s\n"
267 "inch %[outpos], all, mul #1\n"
268 "zip1 z2.s, z9.s, z13.s\n"
269 "zip2 z3.s, z9.s, z13.s\n"
270 "zip1 z4.s, z10.s, z14.s\n"
271 "whilelt p3.h, %[outpos], %[outwidth]\n"
272 "zip2 z5.s, z10.s, z14.s\n"
273 "inch %[outpos], all, mul #1\n"
274 "zip1 z6.s, z11.s, z15.s\n"
275 "zip2 z7.s, z11.s, z15.s\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
276 "zip1 z8.s, z0.s, z4.s\n"
277 "whilelt p4.h, %[outpos], %[outwidth]\n"
278 "zip2 z9.s, z0.s, z4.s\n"
279 "inch %[outpos], all, mul #1\n"
280 "zip1 z10.s, z1.s, z5.s\n"
281 "st1h z8.h, p0, [%[outptr]]\n"
282 "zip2 z11.s, z1.s, z5.s\n"
283 "zip1 z12.s, z2.s, z6.s\n"
284 "whilelt p5.h, %[outpos], %[outwidth]\n"
285 "zip2 z13.s, z2.s, z6.s\n"
286 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
287 "zip1 z14.s, z3.s, z7.s\n"
288 "inch %[outpos], all, mul #1\n"
289 "zip2 z15.s, z3.s, z7.s\n"
290 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
291 "whilelt p6.h, %[outpos], %[outwidth]\n"
292 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
293 "inch %[outpos], all, mul #1\n"
294 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
295 "whilelt p7.h, %[outpos], %[outwidth]\n"
296 "inch %[outpos], all, mul #1\n"
297 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
298 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
299 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
300 "addvl %[outptr], %[outptr], #8\n"
301 "b 1b\n"
302 "2:\n"
303 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
304 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
305 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
306 );
307 break;
308
 // Five rows.  Row 4 is real (z4), so row 0 pairs with it; z5 is the zero
 // partner for rows 1, 2 and 3 in round 1.
309 case 5:
310 __asm __volatile(
311 "1:\n"
312 "whilelt p0.h, %[inpos], %[inwidth]\n"
313 "b.none 2f\n"
314 "mov z5.h, #0\n"
315 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
316 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
317 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
318 "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
319 "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
320 "inch %[inpos], all, mul #1\n"
 // Zip round 1.
321 "zip1 z10.s, z1.s, z5.s\n"
322 "whilelt p0.h, %[outpos], %[outwidth]\n"
323 "zip1 z8.s, z0.s, z4.s\n"
324 "inch %[outpos], all, mul #1\n"
325 "zip2 z9.s, z0.s, z4.s\n"
326 "zip2 z11.s, z1.s, z5.s\n"
327 "zip1 z12.s, z2.s, z5.s\n"
328 "whilelt p1.h, %[outpos], %[outwidth]\n"
329 "zip2 z13.s, z2.s, z5.s\n"
330 "inch %[outpos], all, mul #1\n"
331 "zip1 z14.s, z3.s, z5.s\n"
332 "zip2 z15.s, z3.s, z5.s\n"
 // Zip round 2.
333 "zip1 z0.s, z8.s, z12.s\n"
334 "whilelt p2.h, %[outpos], %[outwidth]\n"
335 "zip2 z1.s, z8.s, z12.s\n"
336 "inch %[outpos], all, mul #1\n"
337 "zip1 z2.s, z9.s, z13.s\n"
338 "zip2 z3.s, z9.s, z13.s\n"
339 "zip1 z4.s, z10.s, z14.s\n"
340 "whilelt p3.h, %[outpos], %[outwidth]\n"
341 "zip2 z5.s, z10.s, z14.s\n"
342 "inch %[outpos], all, mul #1\n"
343 "zip1 z6.s, z11.s, z15.s\n"
344 "zip2 z7.s, z11.s, z15.s\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
345 "zip1 z8.s, z0.s, z4.s\n"
346 "whilelt p4.h, %[outpos], %[outwidth]\n"
347 "zip2 z9.s, z0.s, z4.s\n"
348 "inch %[outpos], all, mul #1\n"
349 "zip1 z10.s, z1.s, z5.s\n"
350 "st1h z8.h, p0, [%[outptr]]\n"
351 "zip2 z11.s, z1.s, z5.s\n"
352 "zip1 z12.s, z2.s, z6.s\n"
353 "whilelt p5.h, %[outpos], %[outwidth]\n"
354 "zip2 z13.s, z2.s, z6.s\n"
355 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
356 "zip1 z14.s, z3.s, z7.s\n"
357 "inch %[outpos], all, mul #1\n"
358 "zip2 z15.s, z3.s, z7.s\n"
359 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
360 "whilelt p6.h, %[outpos], %[outwidth]\n"
361 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
362 "inch %[outpos], all, mul #1\n"
363 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
364 "whilelt p7.h, %[outpos], %[outwidth]\n"
365 "inch %[outpos], all, mul #1\n"
366 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
367 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
368 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
369 "addvl %[outptr], %[outptr], #8\n"
370 "b 1b\n"
371 "2:\n"
372 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
373 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
374 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
375 );
376 break;
377
 // Six rows.  Rows 4 and 5 are real; z6 is the zero partner for rows 2 and 3
 // in round 1.
378 case 6:
379 __asm __volatile(
380 "1:\n"
381 "whilelt p0.h, %[inpos], %[inwidth]\n"
382 "b.none 2f\n"
383 "mov z6.h, #0\n"
384 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
385 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
386 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
387 "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
388 "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
389 "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
390 "inch %[inpos], all, mul #1\n"
 // Zip round 1.
391 "zip1 z12.s, z2.s, z6.s\n"
392 "whilelt p0.h, %[outpos], %[outwidth]\n"
393 "zip1 z8.s, z0.s, z4.s\n"
394 "inch %[outpos], all, mul #1\n"
395 "zip2 z9.s, z0.s, z4.s\n"
396 "zip1 z10.s, z1.s, z5.s\n"
397 "zip2 z11.s, z1.s, z5.s\n"
398 "whilelt p1.h, %[outpos], %[outwidth]\n"
399 "zip2 z13.s, z2.s, z6.s\n"
400 "inch %[outpos], all, mul #1\n"
401 "zip1 z14.s, z3.s, z6.s\n"
402 "zip2 z15.s, z3.s, z6.s\n"
 // Zip round 2.
403 "zip1 z0.s, z8.s, z12.s\n"
404 "whilelt p2.h, %[outpos], %[outwidth]\n"
405 "zip2 z1.s, z8.s, z12.s\n"
406 "inch %[outpos], all, mul #1\n"
407 "zip1 z2.s, z9.s, z13.s\n"
408 "zip2 z3.s, z9.s, z13.s\n"
409 "zip1 z4.s, z10.s, z14.s\n"
410 "whilelt p3.h, %[outpos], %[outwidth]\n"
411 "zip2 z5.s, z10.s, z14.s\n"
412 "inch %[outpos], all, mul #1\n"
413 "zip1 z6.s, z11.s, z15.s\n"
414 "zip2 z7.s, z11.s, z15.s\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
415 "zip1 z8.s, z0.s, z4.s\n"
416 "whilelt p4.h, %[outpos], %[outwidth]\n"
417 "zip2 z9.s, z0.s, z4.s\n"
418 "inch %[outpos], all, mul #1\n"
419 "zip1 z10.s, z1.s, z5.s\n"
420 "st1h z8.h, p0, [%[outptr]]\n"
421 "zip2 z11.s, z1.s, z5.s\n"
422 "zip1 z12.s, z2.s, z6.s\n"
423 "whilelt p5.h, %[outpos], %[outwidth]\n"
424 "zip2 z13.s, z2.s, z6.s\n"
425 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
426 "zip1 z14.s, z3.s, z7.s\n"
427 "inch %[outpos], all, mul #1\n"
428 "zip2 z15.s, z3.s, z7.s\n"
429 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
430 "whilelt p6.h, %[outpos], %[outwidth]\n"
431 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
432 "inch %[outpos], all, mul #1\n"
433 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
434 "whilelt p7.h, %[outpos], %[outwidth]\n"
435 "inch %[outpos], all, mul #1\n"
436 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
437 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
438 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
439 "addvl %[outptr], %[outptr], #8\n"
440 "b 1b\n"
441 "2:\n"
442 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
443 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
444 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
445 );
446 break;
447
 // Seven rows.  Only row 7 is missing; z7 is zeroed to stand in for it.
448 case 7:
449 __asm __volatile(
450 "1:\n"
451 "whilelt p0.h, %[inpos], %[inwidth]\n"
452 "b.none 2f\n"
453 "mov z7.h, #0\n"
454 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
455 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
456 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
457 "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
458 "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
459 "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
460 "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
461 "inch %[inpos], all, mul #1\n"
 // Zip round 1: row n paired with row n+4.
462 "zip1 z8.s, z0.s, z4.s\n"
463 "whilelt p0.h, %[outpos], %[outwidth]\n"
464 "zip2 z9.s, z0.s, z4.s\n"
465 "inch %[outpos], all, mul #1\n"
466 "zip1 z10.s, z1.s, z5.s\n"
467 "zip2 z11.s, z1.s, z5.s\n"
468 "zip1 z12.s, z2.s, z6.s\n"
469 "whilelt p1.h, %[outpos], %[outwidth]\n"
470 "zip2 z13.s, z2.s, z6.s\n"
471 "inch %[outpos], all, mul #1\n"
472 "zip1 z14.s, z3.s, z7.s\n"
473 "zip2 z15.s, z3.s, z7.s\n"
 // Zip round 2.
474 "zip1 z0.s, z8.s, z12.s\n"
475 "whilelt p2.h, %[outpos], %[outwidth]\n"
476 "zip2 z1.s, z8.s, z12.s\n"
477 "inch %[outpos], all, mul #1\n"
478 "zip1 z2.s, z9.s, z13.s\n"
479 "zip2 z3.s, z9.s, z13.s\n"
480 "zip1 z4.s, z10.s, z14.s\n"
481 "whilelt p3.h, %[outpos], %[outwidth]\n"
482 "zip2 z5.s, z10.s, z14.s\n"
483 "inch %[outpos], all, mul #1\n"
484 "zip1 z6.s, z11.s, z15.s\n"
485 "zip2 z7.s, z11.s, z15.s\n"
 // Zip round 3 into z8..z15, interleaved with the stores.
486 "zip1 z8.s, z0.s, z4.s\n"
487 "whilelt p4.h, %[outpos], %[outwidth]\n"
488 "zip2 z9.s, z0.s, z4.s\n"
489 "inch %[outpos], all, mul #1\n"
490 "zip1 z10.s, z1.s, z5.s\n"
491 "st1h z8.h, p0, [%[outptr]]\n"
492 "zip2 z11.s, z1.s, z5.s\n"
493 "zip1 z12.s, z2.s, z6.s\n"
494 "whilelt p5.h, %[outpos], %[outwidth]\n"
495 "zip2 z13.s, z2.s, z6.s\n"
496 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
497 "zip1 z14.s, z3.s, z7.s\n"
498 "inch %[outpos], all, mul #1\n"
499 "zip2 z15.s, z3.s, z7.s\n"
500 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
501 "whilelt p6.h, %[outpos], %[outwidth]\n"
502 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
503 "inch %[outpos], all, mul #1\n"
504 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
505 "whilelt p7.h, %[outpos], %[outwidth]\n"
506 "inch %[outpos], all, mul #1\n"
507 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
508 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
509 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
510 "addvl %[outptr], %[outptr], #8\n"
511 "b 1b\n"
512 "2:\n"
513 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
514 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
515 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
516 );
517 break;
518
 // Full group: any height of 8 or more lands here (default shares the case 8
 // body).  All eight rows are loaded, so no zero vector is needed.
519 default:
520 case 8:
521 __asm __volatile(
522 "1:\n"
523 "whilelt p0.h, %[inpos], %[inwidth]\n"
524 "b.none 2f\n"
525 "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
526 "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
527 "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
528 "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
529 "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
530 "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
531 "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
532 "ld1h z7.h, p0/z, [%[inptr7], %[inpos], LSL #1]\n"
533 "inch %[inpos], all, mul #1\n"
 // Zip round 1: row n paired with row n+4 (z8/z9 = rows 0,4; z10/z11 = 1,5;
 // z12/z13 = 2,6; z14/z15 = 3,7).
534 "zip1 z8.s, z0.s, z4.s\n"
535 "whilelt p0.h, %[outpos], %[outwidth]\n"
536 "zip2 z9.s, z0.s, z4.s\n"
537 "inch %[outpos], all, mul #1\n"
538 "zip1 z10.s, z1.s, z5.s\n"
539 "zip2 z11.s, z1.s, z5.s\n"
540 "zip1 z12.s, z2.s, z6.s\n"
541 "whilelt p1.h, %[outpos], %[outwidth]\n"
542 "zip2 z13.s, z2.s, z6.s\n"
543 "inch %[outpos], all, mul #1\n"
544 "zip1 z14.s, z3.s, z7.s\n"
545 "zip2 z15.s, z3.s, z7.s\n"
 // Zip round 2: z0..z3 hold even rows (0,2,4,6), z4..z7 hold odd rows
 // (1,3,5,7).
546 "zip1 z0.s, z8.s, z12.s\n"
547 "whilelt p2.h, %[outpos], %[outwidth]\n"
548 "zip2 z1.s, z8.s, z12.s\n"
549 "inch %[outpos], all, mul #1\n"
550 "zip1 z2.s, z9.s, z13.s\n"
551 "zip2 z3.s, z9.s, z13.s\n"
552 "zip1 z4.s, z10.s, z14.s\n"
553 "whilelt p3.h, %[outpos], %[outwidth]\n"
554 "zip2 z5.s, z10.s, z14.s\n"
555 "inch %[outpos], all, mul #1\n"
556 "zip1 z6.s, z11.s, z15.s\n"
557 "zip2 z7.s, z11.s, z15.s\n"
 // Zip round 3: merge even and odd rows so each 32-bit column unit appears as
 // rows 0..7 in order; results in z8..z15 are stored in that order.
558 "zip1 z8.s, z0.s, z4.s\n"
559 "whilelt p4.h, %[outpos], %[outwidth]\n"
560 "zip2 z9.s, z0.s, z4.s\n"
561 "inch %[outpos], all, mul #1\n"
562 "zip1 z10.s, z1.s, z5.s\n"
563 "st1h z8.h, p0, [%[outptr]]\n"
564 "zip2 z11.s, z1.s, z5.s\n"
565 "zip1 z12.s, z2.s, z6.s\n"
566 "whilelt p5.h, %[outpos], %[outwidth]\n"
567 "zip2 z13.s, z2.s, z6.s\n"
568 "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
569 "zip1 z14.s, z3.s, z7.s\n"
570 "inch %[outpos], all, mul #1\n"
571 "zip2 z15.s, z3.s, z7.s\n"
572 "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
573 "whilelt p6.h, %[outpos], %[outwidth]\n"
574 "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
575 "inch %[outpos], all, mul #1\n"
576 "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
577 "whilelt p7.h, %[outpos], %[outwidth]\n"
578 "inch %[outpos], all, mul #1\n"
579 "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
580 "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
581 "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
582 "addvl %[outptr], %[outptr], #8\n"
583 "b 1b\n"
584 "2:\n"
585 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
586 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
587 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
588 );
589 break;
590
591
592 }
593 }
594}
595
596#endif // __ARM_FEATURE_SVE