blob: f21933b8de468cbeb0aed2c8298861461356969d [file] [log] [blame]
/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
28template<>
29template<typename T>
30inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
31{
32 uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
33 const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
34
35 for (int y=y0; y<ymax; y+=8)
36 {
37 const int height = ymax-y;
38 const long inwidth = (kmax - k0);
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010039 const long outwidth = ((inwidth + 1) / 2) * 16;
Georgios Pinitas421405b2018-10-26 19:05:32 +010040 long inpos = 0;
41 long outpos = 0;
42
43 uint32_t *outptr = master_outptr;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010044 master_outptr += outwidth;
Georgios Pinitas421405b2018-10-26 19:05:32 +010045
46 const uint32_t *inptr0 = inptr + y * ldin + k0;
47 const uint32_t *inptr1 = inptr0 + ldin;
48 const uint32_t *inptr2 = inptr1 + ldin;
49 const uint32_t *inptr3 = inptr2 + ldin;
50 const uint32_t *inptr4 = inptr3 + ldin;
51 const uint32_t *inptr5 = inptr4 + ldin;
52 const uint32_t *inptr6 = inptr5 + ldin;
53 const uint32_t *inptr7 = inptr6 + ldin;
54
55 switch(height)
56 {
57 case 1:
58 __asm __volatile(
59 "1:\n"
60 "whilelt p0.s, %[inpos], %[inwidth]\n"
61 "b.none 2f\n"
62 "mov z4.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010063 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010064 "incw %[inpos], all, mul #1\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010065 "whilelt p0.s, %[outpos], %[outwidth]\n"
66 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010067 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010068 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010069 "whilelt p1.s, %[outpos], %[outwidth]\n"
70 "zip1 z0.d, z8.d, z4.d\n"
71 "incw %[outpos], all, mul #1\n"
72 "zip2 z1.d, z8.d, z4.d\n"
73 "zip1 z2.d, z9.d, z4.d\n"
74 "zip2 z3.d, z9.d, z4.d\n"
75 "whilelt p2.s, %[outpos], %[outwidth]\n"
76 "zip1 z8.d, z0.d, z4.d\n"
77 "incw %[outpos], all, mul #1\n"
78 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010079 "zip1 z10.d, z1.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010080 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010081 "zip2 z11.d, z1.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010082 "whilelt p3.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010083 "zip1 z12.d, z2.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010084 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010085 "zip2 z13.d, z2.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010086 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010087 "zip1 z14.d, z3.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010088 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +010089 "zip2 z15.d, z3.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +010090 "whilelt p4.s, %[outpos], %[outwidth]\n"
91 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
92 "incw %[outpos], all, mul #1\n"
93 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
94 "whilelt p5.s, %[outpos], %[outwidth]\n"
95 "incw %[outpos], all, mul #1\n"
96 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
97 "whilelt p6.s, %[outpos], %[outwidth]\n"
98 "incw %[outpos], all, mul #1\n"
99 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
100 "whilelt p7.s, %[outpos], %[outwidth]\n"
101 "incw %[outpos], all, mul #1\n"
102 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100103 "addvl %[outptr], %[outptr], #8\n"
104 "b 1b\n"
105 "2:\n"
106 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
107 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100108 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100109 );
110 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100111
Georgios Pinitas421405b2018-10-26 19:05:32 +0100112 case 2:
113 __asm __volatile(
114 "1:\n"
115 "whilelt p0.s, %[inpos], %[inwidth]\n"
116 "b.none 2f\n"
117 "mov z4.s, #0\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100118 "mov z14.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100119 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
120 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
121 "incw %[inpos], all, mul #1\n"
122 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100123 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100124 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100125 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100126 "zip1 z10.d, z1.d, z4.d\n"
127 "zip2 z11.d, z1.d, z4.d\n"
128 "whilelt p1.s, %[outpos], %[outwidth]\n"
129 "zip1 z0.d, z8.d, z4.d\n"
130 "incw %[outpos], all, mul #1\n"
131 "zip2 z1.d, z8.d, z4.d\n"
132 "zip1 z2.d, z9.d, z4.d\n"
133 "zip2 z3.d, z9.d, z4.d\n"
134 "whilelt p2.s, %[outpos], %[outwidth]\n"
135 "zip1 z4.d, z10.d, z14.d\n"
136 "incw %[outpos], all, mul #1\n"
137 "zip2 z5.d, z10.d, z14.d\n"
138 "zip1 z6.d, z11.d, z14.d\n"
139 "zip2 z7.d, z11.d, z14.d\n"
140 "whilelt p3.s, %[outpos], %[outwidth]\n"
141 "zip1 z8.d, z0.d, z4.d\n"
142 "incw %[outpos], all, mul #1\n"
143 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100144 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100145 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100146 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100147 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100148 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100149 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100150 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100151 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100152 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100153 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100154 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100155 "whilelt p5.s, %[outpos], %[outwidth]\n"
156 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
157 "incw %[outpos], all, mul #1\n"
158 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
159 "whilelt p6.s, %[outpos], %[outwidth]\n"
160 "incw %[outpos], all, mul #1\n"
161 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
162 "whilelt p7.s, %[outpos], %[outwidth]\n"
163 "incw %[outpos], all, mul #1\n"
164 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
165 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100166 "addvl %[outptr], %[outptr], #8\n"
167 "b 1b\n"
168 "2:\n"
169 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
170 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100171 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100172 );
173 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100174
Georgios Pinitas421405b2018-10-26 19:05:32 +0100175 case 3:
176 __asm __volatile(
177 "1:\n"
178 "whilelt p0.s, %[inpos], %[inwidth]\n"
179 "b.none 2f\n"
180 "mov z4.s, #0\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100181 "mov z14.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100182 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
183 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
184 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
185 "incw %[inpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100186 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100187 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100188 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100189 "incw %[outpos], all, mul #1\n"
190 "zip1 z10.d, z1.d, z4.d\n"
191 "zip2 z11.d, z1.d, z4.d\n"
192 "zip1 z12.d, z2.d, z4.d\n"
193 "whilelt p1.s, %[outpos], %[outwidth]\n"
194 "zip2 z13.d, z2.d, z4.d\n"
195 "incw %[outpos], all, mul #1\n"
196 "zip1 z0.d, z8.d, z12.d\n"
197 "zip2 z1.d, z8.d, z12.d\n"
198 "zip1 z2.d, z9.d, z13.d\n"
199 "whilelt p2.s, %[outpos], %[outwidth]\n"
200 "zip2 z3.d, z9.d, z13.d\n"
201 "incw %[outpos], all, mul #1\n"
202 "zip1 z4.d, z10.d, z14.d\n"
203 "zip2 z5.d, z10.d, z14.d\n"
204 "zip1 z6.d, z11.d, z14.d\n"
205 "whilelt p3.s, %[outpos], %[outwidth]\n"
206 "zip2 z7.d, z11.d, z14.d\n"
207 "incw %[outpos], all, mul #1\n"
208 "zip1 z8.d, z0.d, z4.d\n"
209 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100210 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100211 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100212 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100213 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100214 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100215 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100216 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100217 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100218 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100219 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100220 "whilelt p5.s, %[outpos], %[outwidth]\n"
221 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
222 "incw %[outpos], all, mul #1\n"
223 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
224 "whilelt p6.s, %[outpos], %[outwidth]\n"
225 "incw %[outpos], all, mul #1\n"
226 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
227 "whilelt p7.s, %[outpos], %[outwidth]\n"
228 "incw %[outpos], all, mul #1\n"
229 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
230 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
231 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100232 "addvl %[outptr], %[outptr], #8\n"
233 "b 1b\n"
234 "2:\n"
235 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
236 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100237 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100238 );
239 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100240
Georgios Pinitas421405b2018-10-26 19:05:32 +0100241 case 4:
242 __asm __volatile(
243 "1:\n"
244 "whilelt p0.s, %[inpos], %[inwidth]\n"
245 "b.none 2f\n"
246 "mov z4.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100247 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
248 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
249 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
250 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100251 "incw %[inpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100252 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100253 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100254 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100255 "incw %[outpos], all, mul #1\n"
256 "zip1 z10.d, z1.d, z4.d\n"
257 "zip2 z11.d, z1.d, z4.d\n"
258 "zip1 z12.d, z2.d, z4.d\n"
259 "whilelt p1.s, %[outpos], %[outwidth]\n"
260 "zip2 z13.d, z2.d, z4.d\n"
261 "incw %[outpos], all, mul #1\n"
262 "zip1 z14.d, z3.d, z4.d\n"
263 "zip2 z15.d, z3.d, z4.d\n"
264 "zip1 z0.d, z8.d, z12.d\n"
265 "whilelt p2.s, %[outpos], %[outwidth]\n"
266 "zip2 z1.d, z8.d, z12.d\n"
267 "incw %[outpos], all, mul #1\n"
268 "zip1 z2.d, z9.d, z13.d\n"
269 "zip2 z3.d, z9.d, z13.d\n"
270 "zip1 z4.d, z10.d, z14.d\n"
271 "whilelt p3.s, %[outpos], %[outwidth]\n"
272 "zip2 z5.d, z10.d, z14.d\n"
273 "incw %[outpos], all, mul #1\n"
274 "zip1 z6.d, z11.d, z15.d\n"
275 "zip2 z7.d, z11.d, z15.d\n"
276 "zip1 z8.d, z0.d, z4.d\n"
277 "whilelt p4.s, %[outpos], %[outwidth]\n"
278 "zip2 z9.d, z0.d, z4.d\n"
279 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100280 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100281 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100282 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100283 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100284 "whilelt p5.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100285 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100286 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100287 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100288 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100289 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100290 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
291 "whilelt p6.s, %[outpos], %[outwidth]\n"
292 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
293 "incw %[outpos], all, mul #1\n"
294 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
295 "whilelt p7.s, %[outpos], %[outwidth]\n"
296 "incw %[outpos], all, mul #1\n"
297 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
298 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
299 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100300 "addvl %[outptr], %[outptr], #8\n"
301 "b 1b\n"
302 "2:\n"
303 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
304 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100305 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100306 );
307 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100308
Georgios Pinitas421405b2018-10-26 19:05:32 +0100309 case 5:
310 __asm __volatile(
311 "1:\n"
312 "whilelt p0.s, %[inpos], %[inwidth]\n"
313 "b.none 2f\n"
314 "mov z5.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100315 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
316 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
317 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
318 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
319 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100320 "incw %[inpos], all, mul #1\n"
321 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100322 "whilelt p0.s, %[outpos], %[outwidth]\n"
323 "zip1 z8.d, z0.d, z4.d\n"
324 "incw %[outpos], all, mul #1\n"
325 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100326 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100327 "zip1 z12.d, z2.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100328 "whilelt p1.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100329 "zip2 z13.d, z2.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100330 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100331 "zip1 z14.d, z3.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100332 "zip2 z15.d, z3.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100333 "zip1 z0.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100334 "whilelt p2.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100335 "zip2 z1.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100336 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100337 "zip1 z2.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100338 "zip2 z3.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100339 "zip1 z4.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100340 "whilelt p3.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100341 "zip2 z5.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100342 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100343 "zip1 z6.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100344 "zip2 z7.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100345 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100346 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100347 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100348 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100349 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100350 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100351 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100352 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100353 "whilelt p5.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100354 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100355 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100356 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100357 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100358 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100359 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
360 "whilelt p6.s, %[outpos], %[outwidth]\n"
361 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
362 "incw %[outpos], all, mul #1\n"
363 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
364 "whilelt p7.s, %[outpos], %[outwidth]\n"
365 "incw %[outpos], all, mul #1\n"
366 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
367 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
368 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100369 "addvl %[outptr], %[outptr], #8\n"
370 "b 1b\n"
371 "2:\n"
372 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
373 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100374 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100375 );
376 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100377
Georgios Pinitas421405b2018-10-26 19:05:32 +0100378 case 6:
379 __asm __volatile(
380 "1:\n"
381 "whilelt p0.s, %[inpos], %[inwidth]\n"
382 "b.none 2f\n"
383 "mov z6.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100384 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
385 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
386 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
387 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
388 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
389 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100390 "incw %[inpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100391 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100392 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100393 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100394 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100395 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100396 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100397 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100398 "whilelt p1.s, %[outpos], %[outwidth]\n"
399 "zip2 z13.d, z2.d, z6.d\n"
400 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100401 "zip1 z14.d, z3.d, z6.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100402 "zip2 z15.d, z3.d, z6.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100403 "zip1 z0.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100404 "whilelt p2.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100405 "zip2 z1.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100406 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100407 "zip1 z2.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100408 "zip2 z3.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100409 "zip1 z4.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100410 "whilelt p3.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100411 "zip2 z5.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100412 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100413 "zip1 z6.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100414 "zip2 z7.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100415 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100416 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100417 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100418 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100419 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100420 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100421 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100422 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100423 "whilelt p5.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100424 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100425 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100426 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100427 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100428 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100429 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
430 "whilelt p6.s, %[outpos], %[outwidth]\n"
431 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
432 "incw %[outpos], all, mul #1\n"
433 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
434 "whilelt p7.s, %[outpos], %[outwidth]\n"
435 "incw %[outpos], all, mul #1\n"
436 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
437 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
438 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100439 "addvl %[outptr], %[outptr], #8\n"
440 "b 1b\n"
441 "2:\n"
442 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
443 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100444 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100445 );
446 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100447
Georgios Pinitas421405b2018-10-26 19:05:32 +0100448 case 7:
449 __asm __volatile(
450 "1:\n"
451 "whilelt p0.s, %[inpos], %[inwidth]\n"
452 "b.none 2f\n"
453 "mov z7.s, #0\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100454 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
455 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
456 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
457 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
458 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
459 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
460 "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100461 "incw %[inpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100462 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100463 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100464 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100465 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100466 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100467 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100468 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100469 "whilelt p1.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100470 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100471 "incw %[outpos], all, mul #1\n"
472 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100473 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100474 "zip1 z0.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100475 "whilelt p2.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100476 "zip2 z1.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100477 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100478 "zip1 z2.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100479 "zip2 z3.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100480 "zip1 z4.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100481 "whilelt p3.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100482 "zip2 z5.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100483 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100484 "zip1 z6.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100485 "zip2 z7.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100486 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100487 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100488 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100489 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100490 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100491 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100492 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100493 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100494 "whilelt p5.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100495 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100496 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100497 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100498 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100499 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100500 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
501 "whilelt p6.s, %[outpos], %[outwidth]\n"
502 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
503 "incw %[outpos], all, mul #1\n"
504 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
505 "whilelt p7.s, %[outpos], %[outwidth]\n"
506 "incw %[outpos], all, mul #1\n"
507 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
508 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
509 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100510 "addvl %[outptr], %[outptr], #8\n"
511 "b 1b\n"
512 "2:\n"
513 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
514 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100515 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100516 );
517 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100518
Georgios Pinitas421405b2018-10-26 19:05:32 +0100519 default:
520 case 8:
521 __asm __volatile(
522 "1:\n"
523 "whilelt p0.s, %[inpos], %[inwidth]\n"
524 "b.none 2f\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100525 "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
526 "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
527 "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
528 "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
529 "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
530 "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
531 "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
532 "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100533 "incw %[inpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100534 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100535 "whilelt p0.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100536 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100537 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100538 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100539 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100540 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100541 "whilelt p1.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100542 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100543 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100544 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100545 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100546 "zip1 z0.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100547 "whilelt p2.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100548 "zip2 z1.d, z8.d, z12.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100549 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100550 "zip1 z2.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100551 "zip2 z3.d, z9.d, z13.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100552 "zip1 z4.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100553 "whilelt p3.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100554 "zip2 z5.d, z10.d, z14.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100555 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100556 "zip1 z6.d, z11.d, z15.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100557 "zip2 z7.d, z11.d, z15.d\n"
558 "zip1 z8.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100559 "whilelt p4.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100560 "zip2 z9.d, z0.d, z4.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100561 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100562 "zip1 z10.d, z1.d, z5.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100563 "st1w z8.s, p0, [%[outptr]]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100564 "zip2 z11.d, z1.d, z5.d\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100565 "zip1 z12.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100566 "whilelt p5.s, %[outpos], %[outwidth]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100567 "zip2 z13.d, z2.d, z6.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100568 "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100569 "zip1 z14.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100570 "incw %[outpos], all, mul #1\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100571 "zip2 z15.d, z3.d, z7.d\n"
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100572 "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
573 "whilelt p6.s, %[outpos], %[outwidth]\n"
574 "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
575 "incw %[outpos], all, mul #1\n"
576 "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
577 "whilelt p7.s, %[outpos], %[outwidth]\n"
578 "incw %[outpos], all, mul #1\n"
579 "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
580 "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
581 "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100582 "addvl %[outptr], %[outptr], #8\n"
583 "b 1b\n"
584 "2:\n"
585 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
586 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100587 : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
Georgios Pinitas421405b2018-10-26 19:05:32 +0100588 );
589 break;
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +0100590
591
Georgios Pinitas421405b2018-10-26 19:05:32 +0100592 }
593 }
594}
595
596#endif // __ARM_FEATURE_SVE