/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#ifdef __ARM_FEATURE_SVE

/**
 * 8-way row interleave of 32-bit elements using SVE.
 *
 * The input and output buffers are reinterpreted as arrays of uint32_t
 * (NOTE(review): this presumes sizeof(T) == 4 - confirm against the callers).
 *
 * Rows [y0, ymax) are processed in groups of eight. For each group, columns
 * [k0, kmax) are read from every row of the group and written out interleaved,
 * i.e. for each column the eight row values are stored consecutively:
 *
 *     out = { r0[k], r1[k], ..., r7[k], r0[k+1], r1[k+1], ... }
 *
 * When fewer than eight rows remain (height 1..7) the missing rows are
 * replaced by zero vectors, so each group always emits (kmax - k0) * 8
 * elements.
 *
 * Each asm block is a vector-length agnostic loop:
 *   - `whilelt p0, inpos, inwidth` builds the load predicate and the loop
 *     exits through `b.none` once no input lanes remain;
 *   - three rounds of zip1/zip2 interleave the (up to) eight row vectors;
 *   - `whilelt p0..p7, outpos, outwidth` build the predicates for the eight
 *     output vectors, which are stored with `st1w` before `outptr` is
 *     advanced by eight vector lengths.
 * The vector instructions are interleaved with the predicate/counter updates
 * and the stores; the exact ordering is kept as in the original code.
 *
 * @param out  Destination buffer (written sequentially).
 * @param in   Source matrix.
 * @param ldin Distance between consecutive source rows, in elements.
 * @param y0   First row to process.
 * @param ymax One past the last row to process.
 * @param k0   First column to process.
 * @param kmax One past the last column to process.
 */
template<>
template<typename T>
inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
{
    uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);

    for (int y=y0; y<ymax; y+=8)
    {
        // Rows left from this group onwards; values above 8 take the full 8-row path.
        const int height = ymax-y;
        const long inwidth = (kmax - k0);
        const long outwidth = inwidth * 8;
        long inpos = 0;
        long outpos = 0;

        // Each group of rows owns a contiguous slice of outwidth elements.
        uint32_t *outptr = master_outptr;
        master_outptr += outwidth;

        // Row pointers are formed for all eight rows, but only the first
        // `height` of them are handed to (and dereferenced by) the asm below.
        const uint32_t *inptr0 = inptr + y * ldin + k0;
        const uint32_t *inptr1 = inptr0 + ldin;
        const uint32_t *inptr2 = inptr1 + ldin;
        const uint32_t *inptr3 = inptr2 + ldin;
        const uint32_t *inptr4 = inptr3 + ldin;
        const uint32_t *inptr5 = inptr4 + ldin;
        const uint32_t *inptr6 = inptr5 + ldin;
        const uint32_t *inptr7 = inptr6 + ldin;

        switch(height)
        {
            case 1:
                // One real row; z4 (zero) stands in for rows 1-7.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z4.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip1 z0.s, z8.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z1.s, z8.s, z4.s\n"
                    "zip1 z2.s, z9.s, z4.s\n"
                    "zip2 z3.s, z9.s, z4.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip1 z10.s, z1.s, z4.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z4.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip1 z12.s, z2.s, z4.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip2 z13.s, z2.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z4.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "zip2 z15.s, z3.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 2:
                // Two real rows; z4 and z14 are zeroed every iteration because
                // both are overwritten by later zip stages.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z4.s, #0\n"
                    "mov z14.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip1 z10.s, z1.s, z4.s\n"
                    "zip2 z11.s, z1.s, z4.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip1 z0.s, z8.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z1.s, z8.s, z4.s\n"
                    "zip1 z2.s, z9.s, z4.s\n"
                    "zip2 z3.s, z9.s, z4.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "zip1 z6.s, z11.s, z14.s\n"
                    "zip2 z7.s, z11.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 3:
                // Three real rows; z4 and z14 provide the zero padding.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z4.s, #0\n"
                    "mov z14.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z4.s\n"
                    "zip2 z11.s, z1.s, z4.s\n"
                    "zip1 z12.s, z2.s, z4.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "zip1 z6.s, z11.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z7.s, z11.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 4:
                // Four real rows; z4 (zero) stands in for rows 4-7.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z4.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z4.s\n"
                    "zip2 z11.s, z1.s, z4.s\n"
                    "zip1 z12.s, z2.s, z4.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z4.s\n"
                    "zip2 z15.s, z3.s, z4.s\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z6.s, z11.s, z15.s\n"
                    "zip2 z7.s, z11.s, z15.s\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 5:
                // Five real rows; z5 (zero) stands in for rows 5-7.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z5.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z5.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z5.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z5.s\n"
                    "zip2 z15.s, z3.s, z5.s\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z6.s, z11.s, z15.s\n"
                    "zip2 z7.s, z11.s, z15.s\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 6:
                // Six real rows; z6 (zero) stands in for rows 6-7.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z6.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z6.s\n"
                    "zip2 z15.s, z3.s, z6.s\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z6.s, z11.s, z15.s\n"
                    "zip2 z7.s, z11.s, z15.s\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            case 7:
                // Seven real rows; z7 (zero) stands in for row 7.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "mov z7.s, #0\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z6.s, z11.s, z15.s\n"
                    "zip2 z7.s, z11.s, z15.s\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;

            default:
            case 8:
                // Full group of eight rows (also taken when more than eight
                // rows remain); no zero padding is needed.
                __asm __volatile(
                    "1:\n"
                    "whilelt p0.s, %[inpos], %[inwidth]\n"
                    "b.none 2f\n"
                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
                    "incw %[inpos], all, mul #1\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "zip1 z0.s, z8.s, z12.s\n"
                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                    "zip2 z1.s, z8.s, z12.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z2.s, z9.s, z13.s\n"
                    "zip2 z3.s, z9.s, z13.s\n"
                    "zip1 z4.s, z10.s, z14.s\n"
                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                    "zip2 z5.s, z10.s, z14.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z6.s, z11.s, z15.s\n"
                    "zip2 z7.s, z11.s, z15.s\n"
                    "zip1 z8.s, z0.s, z4.s\n"
                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                    "zip2 z9.s, z0.s, z4.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip1 z10.s, z1.s, z5.s\n"
                    "st1w z8.s, p0, [%[outptr]]\n"
                    "zip2 z11.s, z1.s, z5.s\n"
                    "zip1 z12.s, z2.s, z6.s\n"
                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                    "zip2 z13.s, z2.s, z6.s\n"
                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                    "zip1 z14.s, z3.s, z7.s\n"
                    "incw %[outpos], all, mul #1\n"
                    "zip2 z15.s, z3.s, z7.s\n"
                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                    "whilelt p6.s, %[outpos], %[outwidth]\n"
                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
                    "whilelt p7.s, %[outpos], %[outwidth]\n"
                    "incw %[outpos], all, mul #1\n"
                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                    "addvl %[outptr], %[outptr], #8\n"
                    "b 1b\n"
                    "2:\n"
                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                );
                break;


        }
    }
}

#endif // __ARM_FEATURE_SVE