blob: cf1d10329bfc3228e735d973efe8eae1679f22ad [file] [log] [blame]
/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#pragma once
25
26#ifdef __ARM_FEATURE_SVE
27
28template<>
29void MergeResults<3, 8, true>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append)
30{
31 const __fp16 *inptr = in;
Georgios Pinitasc7b183a2020-03-06 18:12:09 +000032 __fp16 nullbias[384];
Georgios Pinitas48b3ef82019-10-14 19:03:09 +010033 __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
34 __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
35
36 switch(act.type)
37 {
38 default:
39 case Activation::Type::None:
40 break;
41 case Activation::Type::BoundedReLU:
42 maxval = static_cast<__fp16>(act.param1);
43 /* fall through */
44 case Activation::Type::ReLU:
45 minval = 0.0f;
46 break;
47 }
48
49 if (!append && !bias)
50 {
51 memset(nullbias, 0, (3 * get_vector_length<__fp16>() * sizeof(__fp16)));
52 }
53
54 for (int y=y0; y<ymax; y+=8)
55 {
56 __fp16 *outptr0 = out + (y * ldout) + x0;
57 __fp16 *outptr1 = outptr0 + ldout;
58 __fp16 *outptr2 = outptr1 + ldout;
59 __fp16 *outptr3 = outptr2 + ldout;
60 __fp16 *outptr4 = outptr3 + ldout;
61 __fp16 *outptr5 = outptr4 + ldout;
62 __fp16 *outptr6 = outptr5 + ldout;
63 __fp16 *outptr7 = outptr6 + ldout;
64
65 const int height = ymax - y;
66
67 for (int i=x0; i<xmax; i+=(3 * get_vector_length<__fp16>()))
68 {
69 if (append)
70 {
71 switch(height)
72 {
73 case 1:
74 {
75 long w = xmax - i;
76 long p = 0;
77 /* Optimized routine to copy an entire block */
78 __asm __volatile (
79 "mov z0.h, %h[maxval]\n"
80 "addvl x8, %[inptr], #16\n"
81 "mov z1.h, %h[minval]\n"
82 "whilelt p0.h, %[p], %[w]\n"
83 "inch %[p], all, mul #1\n"
84 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
85 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
86 "ld1h z2.h, p0/z, [%[outptr0]]\n"
87 "whilelt p1.h, %[p], %[w]\n"
88 "ld1h z10.h, p0/z, [%[inptr]]\n"
89 "inch %[p], all, mul #1\n"
90 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
91 "fadd z10.h, z10.h, z2.h\n"
92 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
93 "whilelt p2.h, %[p], %[w]\n"
94 "fmin z10.h, p0/m, z10.h, z0.h\n"
95 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
96 "fadd z11.h, z11.h, z3.h\n"
97 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
98 "addvl %[inptr], %[inptr], #24\n"
99 "fmax z10.h, p0/m, z10.h, z1.h\n"
100 "fmin z11.h, p1/m, z11.h, z0.h\n"
101 "fadd z12.h, z12.h, z4.h\n"
102 "st1h z10.h, p0, [%[outptr0]]\n"
103 "fmax z11.h, p1/m, z11.h, z1.h\n"
104 "fmin z12.h, p2/m, z12.h, z0.h\n"
105 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
106 "fmax z12.h, p2/m, z12.h, z1.h\n"
107 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
108 "addvl %[outptr0], %[outptr0], #3\n"
109 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
110 [inptr] "+r" (inptr), [p] "+r" (p)
111 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
112 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
113 );
114 }
115 break;
116
117 case 2:
118 {
119 long w = xmax - i;
120 long p = 0;
121 /* Optimized routine to copy an entire block */
122 __asm __volatile (
123 "mov z0.h, %h[maxval]\n"
124 "addvl x8, %[inptr], #16\n"
125 "mov z1.h, %h[minval]\n"
126 "whilelt p0.h, %[p], %[w]\n"
127 "inch %[p], all, mul #1\n"
128 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
129 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
130 "ld1h z2.h, p0/z, [%[outptr0]]\n"
131 "whilelt p1.h, %[p], %[w]\n"
132 "ld1h z10.h, p0/z, [%[inptr]]\n"
133 "inch %[p], all, mul #1\n"
134 "ld1h z5.h, p0/z, [%[outptr1]]\n"
135 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
136 "fadd z10.h, z10.h, z2.h\n"
137 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
138 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
139 "whilelt p2.h, %[p], %[w]\n"
140 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
141 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
142 "fmin z10.h, p0/m, z10.h, z0.h\n"
143 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
144 "fadd z11.h, z11.h, z3.h\n"
145 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
146 "fadd z13.h, z13.h, z5.h\n"
147 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
148 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
149 "fmax z10.h, p0/m, z10.h, z1.h\n"
150 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
151 "fmin z11.h, p1/m, z11.h, z0.h\n"
152 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
153 "fadd z12.h, z12.h, z4.h\n"
154 "addvl %[inptr], %[inptr], #24\n"
155 "fmin z13.h, p0/m, z13.h, z0.h\n"
156 "st1h z10.h, p0, [%[outptr0]]\n"
157 "fmax z11.h, p1/m, z11.h, z1.h\n"
158 "fmin z12.h, p2/m, z12.h, z0.h\n"
159 "fadd z14.h, z14.h, z6.h\n"
160 "fmax z13.h, p0/m, z13.h, z1.h\n"
161 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
162 "fadd z15.h, z15.h, z7.h\n"
163 "fmax z12.h, p2/m, z12.h, z1.h\n"
164 "fmin z14.h, p1/m, z14.h, z0.h\n"
165 "fmin z15.h, p2/m, z15.h, z0.h\n"
166 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
167 "addvl %[outptr0], %[outptr0], #3\n"
168 "fmax z14.h, p1/m, z14.h, z1.h\n"
169 "fmax z15.h, p2/m, z15.h, z1.h\n"
170 "st1h z13.h, p0, [%[outptr1]]\n"
171 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
172 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
173 "addvl %[outptr1], %[outptr1], #3\n"
174 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
175 [inptr] "+r" (inptr), [p] "+r" (p)
176 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
177 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
178 );
179 }
180 break;
181
182 case 3:
183 {
184 long w = xmax - i;
185 long p = 0;
186 /* Optimized routine to copy an entire block */
187 __asm __volatile (
188 "mov z0.h, %h[maxval]\n"
189 "addvl x8, %[inptr], #16\n"
190 "mov z1.h, %h[minval]\n"
191 "whilelt p0.h, %[p], %[w]\n"
192 "inch %[p], all, mul #1\n"
193 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
194 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
195 "ld1h z2.h, p0/z, [%[outptr0]]\n"
196 "whilelt p1.h, %[p], %[w]\n"
197 "ld1h z10.h, p0/z, [%[inptr]]\n"
198 "inch %[p], all, mul #1\n"
199 "ld1h z5.h, p0/z, [%[outptr1]]\n"
200 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
201 "fadd z10.h, z10.h, z2.h\n"
202 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
203 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
204 "whilelt p2.h, %[p], %[w]\n"
205 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
206 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
207 "fmin z10.h, p0/m, z10.h, z0.h\n"
208 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
209 "fadd z11.h, z11.h, z3.h\n"
210 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
211 "fadd z13.h, z13.h, z5.h\n"
212 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
213 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
214 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
215 "fmax z10.h, p0/m, z10.h, z1.h\n"
216 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
217 "fmin z11.h, p1/m, z11.h, z0.h\n"
218 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
219 "fadd z12.h, z12.h, z4.h\n"
220 "ld1h z8.h, p0/z, [%[outptr2]]\n"
221 "fmin z13.h, p0/m, z13.h, z0.h\n"
222 "st1h z10.h, p0, [%[outptr0]]\n"
223 "fadd z14.h, z14.h, z6.h\n"
224 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
225 "fmax z11.h, p1/m, z11.h, z1.h\n"
226 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
227 "fmin z12.h, p2/m, z12.h, z0.h\n"
228 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
229 "fmax z13.h, p0/m, z13.h, z1.h\n"
230 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
231 "fmin z14.h, p1/m, z14.h, z0.h\n"
232 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
233 "fadd z15.h, z15.h, z7.h\n"
234 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
235 "fmax z12.h, p2/m, z12.h, z1.h\n"
236 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
237 "fmax z14.h, p1/m, z14.h, z1.h\n"
238 "addvl %[inptr], %[inptr], #24\n"
239 "fmin z15.h, p2/m, z15.h, z0.h\n"
240 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
241 "fadd z16.h, z16.h, z8.h\n"
242 "addvl %[outptr0], %[outptr0], #3\n"
243 "fadd z17.h, z17.h, z9.h\n"
244 "st1h z13.h, p0, [%[outptr1]]\n"
245 "fmax z15.h, p2/m, z15.h, z1.h\n"
246 "fmin z16.h, p0/m, z16.h, z0.h\n"
247 "fadd z10.h, z10.h, z2.h\n"
248 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
249 "fmin z17.h, p1/m, z17.h, z0.h\n"
250 "fmax z16.h, p0/m, z16.h, z1.h\n"
251 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
252 "fmin z10.h, p2/m, z10.h, z0.h\n"
253 "addvl %[outptr1], %[outptr1], #3\n"
254 "fmax z17.h, p1/m, z17.h, z1.h\n"
255 "st1h z16.h, p0, [%[outptr2]]\n"
256 "fmax z10.h, p2/m, z10.h, z1.h\n"
257 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
258 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
259 "addvl %[outptr2], %[outptr2], #3\n"
260 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
261 [inptr] "+r" (inptr), [p] "+r" (p)
262 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
263 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
264 );
265 }
266 break;
267
268 case 4:
269 {
270 long w = xmax - i;
271 long p = 0;
272 /* Optimized routine to copy an entire block */
273 __asm __volatile (
274 "mov z0.h, %h[maxval]\n"
275 "addvl x8, %[inptr], #16\n"
276 "mov z1.h, %h[minval]\n"
277 "whilelt p0.h, %[p], %[w]\n"
278 "inch %[p], all, mul #1\n"
279 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
280 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
281 "ld1h z2.h, p0/z, [%[outptr0]]\n"
282 "whilelt p1.h, %[p], %[w]\n"
283 "ld1h z10.h, p0/z, [%[inptr]]\n"
284 "inch %[p], all, mul #1\n"
285 "ld1h z5.h, p0/z, [%[outptr1]]\n"
286 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
287 "fadd z10.h, z10.h, z2.h\n"
288 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
289 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
290 "whilelt p2.h, %[p], %[w]\n"
291 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
292 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
293 "fmin z10.h, p0/m, z10.h, z0.h\n"
294 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
295 "fadd z11.h, z11.h, z3.h\n"
296 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
297 "fadd z13.h, z13.h, z5.h\n"
298 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
299 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
300 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
301 "fmax z10.h, p0/m, z10.h, z1.h\n"
302 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
303 "fmin z11.h, p1/m, z11.h, z0.h\n"
304 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
305 "fadd z12.h, z12.h, z4.h\n"
306 "ld1h z8.h, p0/z, [%[outptr2]]\n"
307 "fmin z13.h, p0/m, z13.h, z0.h\n"
308 "st1h z10.h, p0, [%[outptr0]]\n"
309 "fadd z14.h, z14.h, z6.h\n"
310 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
311 "fmax z11.h, p1/m, z11.h, z1.h\n"
312 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
313 "fmin z12.h, p2/m, z12.h, z0.h\n"
314 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
315 "fmax z13.h, p0/m, z13.h, z1.h\n"
316 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
317 "fmin z14.h, p1/m, z14.h, z0.h\n"
318 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
319 "fadd z15.h, z15.h, z7.h\n"
320 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
321 "fmax z12.h, p2/m, z12.h, z1.h\n"
322 "ld1h z3.h, p0/z, [%[outptr3]]\n"
323 "fadd z16.h, z16.h, z8.h\n"
324 "ld1h z11.h, p0/z, [x8, #-7, MUL VL]\n"
325 "fmax z14.h, p1/m, z14.h, z1.h\n"
326 "ld1h z4.h, p1/z, [%[outptr3], #1, MUL VL]\n"
327 "fmin z15.h, p2/m, z15.h, z0.h\n"
328 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
329 "fadd z17.h, z17.h, z9.h\n"
330 "ld1h z12.h, p1/z, [x8, #-6, MUL VL]\n"
331 "fmin z16.h, p0/m, z16.h, z0.h\n"
332 "ld1h z5.h, p2/z, [%[outptr3], #2, MUL VL]\n"
333 "fadd z10.h, z10.h, z2.h\n"
334 "st1h z13.h, p0, [%[outptr1]]\n"
335 "fmax z15.h, p2/m, z15.h, z1.h\n"
336 "ld1h z13.h, p2/z, [x8, #-5, MUL VL]\n"
337 "fmin z17.h, p1/m, z17.h, z0.h\n"
338 "addvl %[outptr0], %[outptr0], #3\n"
339 "fmax z16.h, p0/m, z16.h, z1.h\n"
340 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
341 "fmin z10.h, p2/m, z10.h, z0.h\n"
342 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
343 "fmax z17.h, p1/m, z17.h, z1.h\n"
344 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
345 "fadd z11.h, z11.h, z3.h\n"
346 "addvl %[outptr1], %[outptr1], #3\n"
347 "fmax z10.h, p2/m, z10.h, z1.h\n"
348 "st1h z16.h, p0, [%[outptr2]]\n"
349 "fadd z12.h, z12.h, z4.h\n"
350 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
351 "fmin z11.h, p0/m, z11.h, z0.h\n"
352 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
353 "fadd z13.h, z13.h, z5.h\n"
354 "addvl %[inptr], %[inptr], #24\n"
355 "fmin z12.h, p1/m, z12.h, z0.h\n"
356 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
357 "fmax z11.h, p0/m, z11.h, z1.h\n"
358 "addvl %[outptr2], %[outptr2], #3\n"
359 "fmin z13.h, p2/m, z13.h, z0.h\n"
360 "fmax z12.h, p1/m, z12.h, z1.h\n"
361 "st1h z11.h, p0, [%[outptr3]]\n"
362 "fmax z13.h, p2/m, z13.h, z1.h\n"
363 "st1h z12.h, p1, [%[outptr3], #1, MUL VL]\n"
364 "st1h z13.h, p2, [%[outptr3], #2, MUL VL]\n"
365 "addvl %[outptr3], %[outptr3], #3\n"
366 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
367 [inptr] "+r" (inptr), [p] "+r" (p)
368 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
369 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
370 );
371 }
372 break;
373
374 case 5:
375 {
376 long w = xmax - i;
377 long p = 0;
378 /* Optimized routine to copy an entire block */
379 __asm __volatile (
380 "mov z0.h, %h[maxval]\n"
381 "addvl x8, %[inptr], #16\n"
382 "mov z1.h, %h[minval]\n"
383 "whilelt p0.h, %[p], %[w]\n"
384 "inch %[p], all, mul #1\n"
385 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
386 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
387 "ld1h z2.h, p0/z, [%[outptr0]]\n"
388 "whilelt p1.h, %[p], %[w]\n"
389 "ld1h z10.h, p0/z, [%[inptr]]\n"
390 "inch %[p], all, mul #1\n"
391 "ld1h z5.h, p0/z, [%[outptr1]]\n"
392 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
393 "fadd z10.h, z10.h, z2.h\n"
394 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
395 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
396 "whilelt p2.h, %[p], %[w]\n"
397 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
398 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
399 "fmin z10.h, p0/m, z10.h, z0.h\n"
400 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
401 "fadd z11.h, z11.h, z3.h\n"
402 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
403 "fadd z13.h, z13.h, z5.h\n"
404 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
405 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
406 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
407 "fmax z10.h, p0/m, z10.h, z1.h\n"
408 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
409 "fmin z11.h, p1/m, z11.h, z0.h\n"
410 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
411 "fadd z12.h, z12.h, z4.h\n"
412 "ld1h z8.h, p0/z, [%[outptr2]]\n"
413 "fmin z13.h, p0/m, z13.h, z0.h\n"
414 "st1h z10.h, p0, [%[outptr0]]\n"
415 "fadd z14.h, z14.h, z6.h\n"
416 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
417 "fmax z11.h, p1/m, z11.h, z1.h\n"
418 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
419 "fmin z12.h, p2/m, z12.h, z0.h\n"
420 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
421 "fmax z13.h, p0/m, z13.h, z1.h\n"
422 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
423 "fmin z14.h, p1/m, z14.h, z0.h\n"
424 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
425 "fadd z15.h, z15.h, z7.h\n"
426 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
427 "fmax z12.h, p2/m, z12.h, z1.h\n"
428 "ld1h z3.h, p0/z, [%[outptr3]]\n"
429 "fadd z16.h, z16.h, z8.h\n"
430 "ld1h z11.h, p0/z, [x8, #-7, MUL VL]\n"
431 "fmax z14.h, p1/m, z14.h, z1.h\n"
432 "ld1h z4.h, p1/z, [%[outptr3], #1, MUL VL]\n"
433 "fmin z15.h, p2/m, z15.h, z0.h\n"
434 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
435 "fadd z17.h, z17.h, z9.h\n"
436 "ld1h z12.h, p1/z, [x8, #-6, MUL VL]\n"
437 "fmin z16.h, p0/m, z16.h, z0.h\n"
438 "ld1h z5.h, p2/z, [%[outptr3], #2, MUL VL]\n"
439 "fadd z10.h, z10.h, z2.h\n"
440 "st1h z13.h, p0, [%[outptr1]]\n"
441 "fmax z15.h, p2/m, z15.h, z1.h\n"
442 "ld1h z13.h, p2/z, [x8, #-5, MUL VL]\n"
443 "fmin z17.h, p1/m, z17.h, z0.h\n"
444 "ld1h z6.h, p0/z, [%[outptr4]]\n"
445 "fmax z16.h, p0/m, z16.h, z1.h\n"
446 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
447 "fmin z10.h, p2/m, z10.h, z0.h\n"
448 "ld1h z14.h, p0/z, [x8, #-4, MUL VL]\n"
449 "fadd z11.h, z11.h, z3.h\n"
450 "ld1h z7.h, p1/z, [%[outptr4], #1, MUL VL]\n"
451 "fmax z17.h, p1/m, z17.h, z1.h\n"
452 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
453 "fadd z12.h, z12.h, z4.h\n"
454 "ld1h z15.h, p1/z, [x8, #-3, MUL VL]\n"
455 "fmax z10.h, p2/m, z10.h, z1.h\n"
456 "ld1h z8.h, p2/z, [%[outptr4], #2, MUL VL]\n"
457 "fmin z11.h, p0/m, z11.h, z0.h\n"
458 "st1h z16.h, p0, [%[outptr2]]\n"
459 "fadd z13.h, z13.h, z5.h\n"
460 "ld1h z16.h, p2/z, [x8, #-2, MUL VL]\n"
461 "fmin z12.h, p1/m, z12.h, z0.h\n"
462 "addvl %[outptr0], %[outptr0], #3\n"
463 "fmax z11.h, p0/m, z11.h, z1.h\n"
464 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
465 "fmin z13.h, p2/m, z13.h, z0.h\n"
466 "addvl %[outptr1], %[outptr1], #3\n"
467 "fmax z12.h, p1/m, z12.h, z1.h\n"
468 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
469 "fadd z14.h, z14.h, z6.h\n"
470 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
471 "fmax z13.h, p2/m, z13.h, z1.h\n"
472 "st1h z11.h, p0, [%[outptr3]]\n"
473 "fadd z15.h, z15.h, z7.h\n"
474 "addvl %[outptr2], %[outptr2], #3\n"
475 "fmin z14.h, p0/m, z14.h, z0.h\n"
476 "st1h z12.h, p1, [%[outptr3], #1, MUL VL]\n"
477 "fadd z16.h, z16.h, z8.h\n"
478 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
479 "fmin z15.h, p1/m, z15.h, z0.h\n"
480 "st1h z13.h, p2, [%[outptr3], #2, MUL VL]\n"
481 "fmax z14.h, p0/m, z14.h, z1.h\n"
482 "addvl %[outptr3], %[outptr3], #3\n"
483 "fmin z16.h, p2/m, z16.h, z0.h\n"
484 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
485 "fmax z15.h, p1/m, z15.h, z1.h\n"
486 "st1h z14.h, p0, [%[outptr4]]\n"
487 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
488 "fmax z16.h, p2/m, z16.h, z1.h\n"
489 "addvl %[inptr], %[inptr], #24\n"
490 "st1h z15.h, p1, [%[outptr4], #1, MUL VL]\n"
491 "st1h z16.h, p2, [%[outptr4], #2, MUL VL]\n"
492 "addvl %[outptr4], %[outptr4], #3\n"
493 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
494 [inptr] "+r" (inptr), [p] "+r" (p)
495 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
496 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
497 );
498 }
499 break;
500
501 case 6:
502 {
503 long w = xmax - i;
504 long p = 0;
505 /* Optimized routine to copy an entire block */
506 __asm __volatile (
507 "mov z0.h, %h[maxval]\n"
508 "addvl x8, %[inptr], #16\n"
509 "mov z1.h, %h[minval]\n"
510 "whilelt p0.h, %[p], %[w]\n"
511 "inch %[p], all, mul #1\n"
512 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
513 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
514 "ld1h z2.h, p0/z, [%[outptr0]]\n"
515 "whilelt p1.h, %[p], %[w]\n"
516 "ld1h z10.h, p0/z, [%[inptr]]\n"
517 "inch %[p], all, mul #1\n"
518 "ld1h z5.h, p0/z, [%[outptr1]]\n"
519 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
520 "fadd z10.h, z10.h, z2.h\n"
521 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
522 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
523 "whilelt p2.h, %[p], %[w]\n"
524 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
525 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
526 "fmin z10.h, p0/m, z10.h, z0.h\n"
527 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
528 "fadd z11.h, z11.h, z3.h\n"
529 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
530 "fadd z13.h, z13.h, z5.h\n"
531 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
532 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
533 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
534 "fmax z10.h, p0/m, z10.h, z1.h\n"
535 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
536 "fmin z11.h, p1/m, z11.h, z0.h\n"
537 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
538 "fadd z12.h, z12.h, z4.h\n"
539 "ld1h z8.h, p0/z, [%[outptr2]]\n"
540 "fmin z13.h, p0/m, z13.h, z0.h\n"
541 "st1h z10.h, p0, [%[outptr0]]\n"
542 "fadd z14.h, z14.h, z6.h\n"
543 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
544 "fmax z11.h, p1/m, z11.h, z1.h\n"
545 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
546 "fmin z12.h, p2/m, z12.h, z0.h\n"
547 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
548 "fmax z13.h, p0/m, z13.h, z1.h\n"
549 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
550 "fmin z14.h, p1/m, z14.h, z0.h\n"
551 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
552 "fadd z15.h, z15.h, z7.h\n"
553 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
554 "fmax z12.h, p2/m, z12.h, z1.h\n"
555 "ld1h z3.h, p0/z, [%[outptr3]]\n"
556 "fadd z16.h, z16.h, z8.h\n"
557 "ld1h z11.h, p0/z, [x8, #-7, MUL VL]\n"
558 "fmax z14.h, p1/m, z14.h, z1.h\n"
559 "ld1h z4.h, p1/z, [%[outptr3], #1, MUL VL]\n"
560 "fmin z15.h, p2/m, z15.h, z0.h\n"
561 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
562 "fadd z17.h, z17.h, z9.h\n"
563 "ld1h z12.h, p1/z, [x8, #-6, MUL VL]\n"
564 "fmin z16.h, p0/m, z16.h, z0.h\n"
565 "ld1h z5.h, p2/z, [%[outptr3], #2, MUL VL]\n"
566 "fadd z10.h, z10.h, z2.h\n"
567 "st1h z13.h, p0, [%[outptr1]]\n"
568 "fmax z15.h, p2/m, z15.h, z1.h\n"
569 "ld1h z13.h, p2/z, [x8, #-5, MUL VL]\n"
570 "fmin z17.h, p1/m, z17.h, z0.h\n"
571 "ld1h z6.h, p0/z, [%[outptr4]]\n"
572 "fmax z16.h, p0/m, z16.h, z1.h\n"
573 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
574 "fmin z10.h, p2/m, z10.h, z0.h\n"
575 "ld1h z14.h, p0/z, [x8, #-4, MUL VL]\n"
576 "fadd z11.h, z11.h, z3.h\n"
577 "ld1h z7.h, p1/z, [%[outptr4], #1, MUL VL]\n"
578 "fmax z17.h, p1/m, z17.h, z1.h\n"
579 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
580 "fadd z12.h, z12.h, z4.h\n"
581 "ld1h z15.h, p1/z, [x8, #-3, MUL VL]\n"
582 "fmax z10.h, p2/m, z10.h, z1.h\n"
583 "ld1h z8.h, p2/z, [%[outptr4], #2, MUL VL]\n"
584 "fmin z11.h, p0/m, z11.h, z0.h\n"
585 "st1h z16.h, p0, [%[outptr2]]\n"
586 "fadd z13.h, z13.h, z5.h\n"
587 "ld1h z16.h, p2/z, [x8, #-2, MUL VL]\n"
588 "fmin z12.h, p1/m, z12.h, z0.h\n"
589 "ld1h z9.h, p0/z, [%[outptr5]]\n"
590 "fadd z14.h, z14.h, z6.h\n"
591 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
592 "fmax z11.h, p0/m, z11.h, z1.h\n"
593 "ld1h z17.h, p0/z, [x8, #-1, MUL VL]\n"
594 "fmin z13.h, p2/m, z13.h, z0.h\n"
595 "ld1h z2.h, p1/z, [%[outptr5], #1, MUL VL]\n"
596 "fmax z12.h, p1/m, z12.h, z1.h\n"
597 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
598 "fmin z14.h, p0/m, z14.h, z0.h\n"
599 "ld1h z10.h, p1/z, [x8]\n"
600 "fadd z15.h, z15.h, z7.h\n"
601 "ld1h z3.h, p2/z, [%[outptr5], #2, MUL VL]\n"
602 "fmax z13.h, p2/m, z13.h, z1.h\n"
603 "st1h z11.h, p0, [%[outptr3]]\n"
604 "fadd z16.h, z16.h, z8.h\n"
605 "ld1h z11.h, p2/z, [x8, #1, MUL VL]\n"
606 "fmax z14.h, p0/m, z14.h, z1.h\n"
607 "addvl %[outptr0], %[outptr0], #3\n"
608 "fmin z15.h, p1/m, z15.h, z0.h\n"
609 "st1h z12.h, p1, [%[outptr3], #1, MUL VL]\n"
610 "fmin z16.h, p2/m, z16.h, z0.h\n"
611 "addvl %[outptr1], %[outptr1], #3\n"
612 "fadd z17.h, z17.h, z9.h\n"
613 "st1h z13.h, p2, [%[outptr3], #2, MUL VL]\n"
614 "fmax z15.h, p1/m, z15.h, z1.h\n"
615 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
616 "fmax z16.h, p2/m, z16.h, z1.h\n"
617 "st1h z14.h, p0, [%[outptr4]]\n"
618 "fmin z17.h, p0/m, z17.h, z0.h\n"
619 "addvl %[outptr2], %[outptr2], #3\n"
620 "fadd z10.h, z10.h, z2.h\n"
621 "st1h z15.h, p1, [%[outptr4], #1, MUL VL]\n"
622 "fadd z11.h, z11.h, z3.h\n"
623 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
624 "fmax z17.h, p0/m, z17.h, z1.h\n"
625 "st1h z16.h, p2, [%[outptr4], #2, MUL VL]\n"
626 "fmin z10.h, p1/m, z10.h, z0.h\n"
627 "addvl %[outptr3], %[outptr3], #3\n"
628 "fmin z11.h, p2/m, z11.h, z0.h\n"
629 "st1h z17.h, p0, [%[outptr5]]\n"
630 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
631 "fmax z10.h, p1/m, z10.h, z1.h\n"
632 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
633 "fmax z11.h, p2/m, z11.h, z1.h\n"
634 "addvl %[outptr4], %[outptr4], #3\n"
635 "st1h z10.h, p1, [%[outptr5], #1, MUL VL]\n"
636 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
637 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
638 "addvl %[inptr], %[inptr], #24\n"
639 "st1h z11.h, p2, [%[outptr5], #2, MUL VL]\n"
640 "addvl %[outptr5], %[outptr5], #3\n"
641 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
642 [inptr] "+r" (inptr), [p] "+r" (p)
643 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
644 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
645 );
646 }
647 break;
648
649 case 7:
650 {
651 long w = xmax - i;
652 long p = 0;
653 /* Optimized routine to copy an entire block */
654 __asm __volatile (
655 "mov z0.h, %h[maxval]\n"
656 "addvl x8, %[inptr], #16\n"
657 "mov z1.h, %h[minval]\n"
658 "whilelt p0.h, %[p], %[w]\n"
659 "inch %[p], all, mul #1\n"
660 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
661 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
662 "ld1h z2.h, p0/z, [%[outptr0]]\n"
663 "whilelt p1.h, %[p], %[w]\n"
664 "ld1h z10.h, p0/z, [%[inptr]]\n"
665 "inch %[p], all, mul #1\n"
666 "ld1h z5.h, p0/z, [%[outptr1]]\n"
667 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
668 "fadd z10.h, z10.h, z2.h\n"
669 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
670 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
671 "whilelt p2.h, %[p], %[w]\n"
672 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
673 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
674 "fmin z10.h, p0/m, z10.h, z0.h\n"
675 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
676 "fadd z11.h, z11.h, z3.h\n"
677 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
678 "fadd z13.h, z13.h, z5.h\n"
679 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
680 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
681 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
682 "fmax z10.h, p0/m, z10.h, z1.h\n"
683 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
684 "fmin z11.h, p1/m, z11.h, z0.h\n"
685 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
686 "fadd z12.h, z12.h, z4.h\n"
687 "ld1h z8.h, p0/z, [%[outptr2]]\n"
688 "fmin z13.h, p0/m, z13.h, z0.h\n"
689 "st1h z10.h, p0, [%[outptr0]]\n"
690 "fadd z14.h, z14.h, z6.h\n"
691 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
692 "fmax z11.h, p1/m, z11.h, z1.h\n"
693 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
694 "fmin z12.h, p2/m, z12.h, z0.h\n"
695 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
696 "fmax z13.h, p0/m, z13.h, z1.h\n"
697 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
698 "fmin z14.h, p1/m, z14.h, z0.h\n"
699 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
700 "fadd z15.h, z15.h, z7.h\n"
701 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
702 "fmax z12.h, p2/m, z12.h, z1.h\n"
703 "ld1h z3.h, p0/z, [%[outptr3]]\n"
704 "fadd z16.h, z16.h, z8.h\n"
705 "ld1h z11.h, p0/z, [x8, #-7, MUL VL]\n"
706 "fmax z14.h, p1/m, z14.h, z1.h\n"
707 "ld1h z4.h, p1/z, [%[outptr3], #1, MUL VL]\n"
708 "fmin z15.h, p2/m, z15.h, z0.h\n"
709 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
710 "fadd z17.h, z17.h, z9.h\n"
711 "ld1h z12.h, p1/z, [x8, #-6, MUL VL]\n"
712 "fmin z16.h, p0/m, z16.h, z0.h\n"
713 "ld1h z5.h, p2/z, [%[outptr3], #2, MUL VL]\n"
714 "fadd z10.h, z10.h, z2.h\n"
715 "st1h z13.h, p0, [%[outptr1]]\n"
716 "fmax z15.h, p2/m, z15.h, z1.h\n"
717 "ld1h z13.h, p2/z, [x8, #-5, MUL VL]\n"
718 "fmin z17.h, p1/m, z17.h, z0.h\n"
719 "ld1h z6.h, p0/z, [%[outptr4]]\n"
720 "fmax z16.h, p0/m, z16.h, z1.h\n"
721 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
722 "fmin z10.h, p2/m, z10.h, z0.h\n"
723 "ld1h z14.h, p0/z, [x8, #-4, MUL VL]\n"
724 "fadd z11.h, z11.h, z3.h\n"
725 "ld1h z7.h, p1/z, [%[outptr4], #1, MUL VL]\n"
726 "fmax z17.h, p1/m, z17.h, z1.h\n"
727 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
728 "fadd z12.h, z12.h, z4.h\n"
729 "ld1h z15.h, p1/z, [x8, #-3, MUL VL]\n"
730 "fmax z10.h, p2/m, z10.h, z1.h\n"
731 "ld1h z8.h, p2/z, [%[outptr4], #2, MUL VL]\n"
732 "fmin z11.h, p0/m, z11.h, z0.h\n"
733 "st1h z16.h, p0, [%[outptr2]]\n"
734 "fadd z13.h, z13.h, z5.h\n"
735 "ld1h z16.h, p2/z, [x8, #-2, MUL VL]\n"
736 "fmin z12.h, p1/m, z12.h, z0.h\n"
737 "ld1h z9.h, p0/z, [%[outptr5]]\n"
738 "fadd z14.h, z14.h, z6.h\n"
739 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
740 "fmax z11.h, p0/m, z11.h, z1.h\n"
741 "ld1h z17.h, p0/z, [x8, #-1, MUL VL]\n"
742 "fmin z13.h, p2/m, z13.h, z0.h\n"
743 "ld1h z2.h, p1/z, [%[outptr5], #1, MUL VL]\n"
744 "fmax z12.h, p1/m, z12.h, z1.h\n"
745 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
746 "fmin z14.h, p0/m, z14.h, z0.h\n"
747 "ld1h z10.h, p1/z, [x8]\n"
748 "fadd z15.h, z15.h, z7.h\n"
749 "ld1h z3.h, p2/z, [%[outptr5], #2, MUL VL]\n"
750 "fmax z13.h, p2/m, z13.h, z1.h\n"
751 "st1h z11.h, p0, [%[outptr3]]\n"
752 "fadd z16.h, z16.h, z8.h\n"
753 "ld1h z11.h, p2/z, [x8, #1, MUL VL]\n"
754 "fmax z14.h, p0/m, z14.h, z1.h\n"
755 "ld1h z4.h, p0/z, [%[outptr6]]\n"
756 "fmin z15.h, p1/m, z15.h, z0.h\n"
757 "st1h z12.h, p1, [%[outptr3], #1, MUL VL]\n"
758 "fadd z17.h, z17.h, z9.h\n"
759 "ld1h z12.h, p0/z, [x8, #2, MUL VL]\n"
760 "fmin z16.h, p2/m, z16.h, z0.h\n"
761 "ld1h z5.h, p1/z, [%[outptr6], #1, MUL VL]\n"
762 "fadd z10.h, z10.h, z2.h\n"
763 "st1h z13.h, p2, [%[outptr3], #2, MUL VL]\n"
764 "fmax z15.h, p1/m, z15.h, z1.h\n"
765 "ld1h z13.h, p1/z, [x8, #3, MUL VL]\n"
766 "fmin z17.h, p0/m, z17.h, z0.h\n"
767 "ld1h z6.h, p2/z, [%[outptr6], #2, MUL VL]\n"
768 "fmax z16.h, p2/m, z16.h, z1.h\n"
769 "st1h z14.h, p0, [%[outptr4]]\n"
770 "fmin z10.h, p1/m, z10.h, z0.h\n"
771 "ld1h z14.h, p2/z, [x8, #4, MUL VL]\n"
772 "fadd z11.h, z11.h, z3.h\n"
773 "addvl %[outptr0], %[outptr0], #3\n"
774 "fmax z17.h, p0/m, z17.h, z1.h\n"
775 "st1h z15.h, p1, [%[outptr4], #1, MUL VL]\n"
776 "fmax z10.h, p1/m, z10.h, z1.h\n"
777 "addvl %[outptr1], %[outptr1], #3\n"
778 "fmin z11.h, p2/m, z11.h, z0.h\n"
779 "st1h z16.h, p2, [%[outptr4], #2, MUL VL]\n"
780 "fadd z12.h, z12.h, z4.h\n"
781 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
782 "fadd z13.h, z13.h, z5.h\n"
783 "st1h z17.h, p0, [%[outptr5]]\n"
784 "fmax z11.h, p2/m, z11.h, z1.h\n"
785 "addvl %[outptr2], %[outptr2], #3\n"
786 "fmin z12.h, p0/m, z12.h, z0.h\n"
787 "st1h z10.h, p1, [%[outptr5], #1, MUL VL]\n"
788 "fmin z13.h, p1/m, z13.h, z0.h\n"
789 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
790 "fadd z14.h, z14.h, z6.h\n"
791 "st1h z11.h, p2, [%[outptr5], #2, MUL VL]\n"
792 "fmax z12.h, p0/m, z12.h, z1.h\n"
793 "addvl %[outptr3], %[outptr3], #3\n"
794 "fmax z13.h, p1/m, z13.h, z1.h\n"
795 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
796 "fmin z14.h, p2/m, z14.h, z0.h\n"
797 "st1h z12.h, p0, [%[outptr6]]\n"
798 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
799 "addvl %[outptr4], %[outptr4], #3\n"
800 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
801 "fmax z14.h, p2/m, z14.h, z1.h\n"
802 "st1h z13.h, p1, [%[outptr6], #1, MUL VL]\n"
803 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
804 "addvl %[outptr5], %[outptr5], #3\n"
805 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
806 "st1h z14.h, p2, [%[outptr6], #2, MUL VL]\n"
807 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
808 "addvl %[outptr6], %[outptr6], #3\n"
809 "addvl %[inptr], %[inptr], #24\n"
810 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
811 [inptr] "+r" (inptr), [p] "+r" (p)
812 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
813 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
814 );
815 }
816 break;
817
818 default:
819 case 8:
820 {
821 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
822 long p = 0; /* running lane index; advanced by inch between predicates */
823 /* Accumulate-and-clamp for 8 rows x 3 vectors of __fp16 (not a plain copy):
     * the input vector for row r, column-vector c is read from
     * inptr + (3*r + c) vectors, added to the value currently stored at
     * outptrR + c vectors, clamped with fmin against maxval and fmax against
     * minval, and stored back under the column predicate p0/p1/p2.
     * On exit every outptr has advanced by 3 vectors and inptr by 24 vectors.
     * The instruction order is interleaved by hand; do not reorder. */
824 __asm __volatile (
825 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
826 "addvl x8, %[inptr], #16\n" /* x8 = inptr + 16 vectors; [x8, #-8..#7] reach input vectors 8-23 */
827 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
828 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
829 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
830 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
831 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
832 "ld1h z2.h, p0/z, [%[outptr0]]\n"
833 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
834 "ld1h z10.h, p0/z, [%[inptr]]\n"
835 "inch %[p], all, mul #1\n"
836 "ld1h z5.h, p0/z, [%[outptr1]]\n"
837 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
838 "fadd z10.h, z10.h, z2.h\n"
839 "ld1h z3.h, p1/z, [%[outptr0], #1, MUL VL]\n"
840 "ld1h z11.h, p1/z, [%[inptr], #1, MUL VL]\n"
841 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
842 "ld1h z13.h, p0/z, [%[inptr], #3, MUL VL]\n"
843 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
844 "fmin z10.h, p0/m, z10.h, z0.h\n"
845 "ld1h z4.h, p2/z, [%[outptr0], #2, MUL VL]\n"
846 "fadd z11.h, z11.h, z3.h\n"
847 "ld1h z12.h, p2/z, [%[inptr], #2, MUL VL]\n"
848 "fadd z13.h, z13.h, z5.h\n"
849 "ld1h z6.h, p1/z, [%[outptr1], #1, MUL VL]\n"
850 "ld1h z14.h, p1/z, [%[inptr], #4, MUL VL]\n"
851 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
852 "fmax z10.h, p0/m, z10.h, z1.h\n"
853 "ld1h z7.h, p2/z, [%[outptr1], #2, MUL VL]\n"
854 "fmin z11.h, p1/m, z11.h, z0.h\n"
855 "ld1h z15.h, p2/z, [%[inptr], #5, MUL VL]\n"
856 "fadd z12.h, z12.h, z4.h\n"
857 "ld1h z8.h, p0/z, [%[outptr2]]\n"
858 "fmin z13.h, p0/m, z13.h, z0.h\n"
859 "st1h z10.h, p0, [%[outptr0]]\n"
860 "fadd z14.h, z14.h, z6.h\n"
861 "ld1h z16.h, p0/z, [%[inptr], #6, MUL VL]\n"
862 "fmax z11.h, p1/m, z11.h, z1.h\n"
863 "ld1h z9.h, p1/z, [%[outptr2], #1, MUL VL]\n"
864 "fmin z12.h, p2/m, z12.h, z0.h\n"
865 "ld1h z17.h, p1/z, [%[inptr], #7, MUL VL]\n"
866 "fmax z13.h, p0/m, z13.h, z1.h\n"
867 "ld1h z2.h, p2/z, [%[outptr2], #2, MUL VL]\n"
868 "fmin z14.h, p1/m, z14.h, z0.h\n"
869 "st1h z11.h, p1, [%[outptr0], #1, MUL VL]\n"
870 "fadd z15.h, z15.h, z7.h\n"
871 "ld1h z10.h, p2/z, [x8, #-8, MUL VL]\n"
872 "fmax z12.h, p2/m, z12.h, z1.h\n"
873 "ld1h z3.h, p0/z, [%[outptr3]]\n"
874 "fadd z16.h, z16.h, z8.h\n"
875 "ld1h z11.h, p0/z, [x8, #-7, MUL VL]\n"
876 "fmax z14.h, p1/m, z14.h, z1.h\n"
877 "ld1h z4.h, p1/z, [%[outptr3], #1, MUL VL]\n"
878 "fmin z15.h, p2/m, z15.h, z0.h\n"
879 "st1h z12.h, p2, [%[outptr0], #2, MUL VL]\n"
880 "fadd z17.h, z17.h, z9.h\n"
881 "ld1h z12.h, p1/z, [x8, #-6, MUL VL]\n"
882 "fmin z16.h, p0/m, z16.h, z0.h\n"
883 "ld1h z5.h, p2/z, [%[outptr3], #2, MUL VL]\n"
884 "fadd z10.h, z10.h, z2.h\n"
885 "st1h z13.h, p0, [%[outptr1]]\n"
886 "fmax z15.h, p2/m, z15.h, z1.h\n"
887 "ld1h z13.h, p2/z, [x8, #-5, MUL VL]\n"
888 "fmin z17.h, p1/m, z17.h, z0.h\n"
889 "ld1h z6.h, p0/z, [%[outptr4]]\n"
890 "fmax z16.h, p0/m, z16.h, z1.h\n"
891 "st1h z14.h, p1, [%[outptr1], #1, MUL VL]\n"
892 "fmin z10.h, p2/m, z10.h, z0.h\n"
893 "ld1h z14.h, p0/z, [x8, #-4, MUL VL]\n"
894 "fadd z11.h, z11.h, z3.h\n"
895 "ld1h z7.h, p1/z, [%[outptr4], #1, MUL VL]\n"
896 "fmax z17.h, p1/m, z17.h, z1.h\n"
897 "st1h z15.h, p2, [%[outptr1], #2, MUL VL]\n"
898 "fadd z12.h, z12.h, z4.h\n"
899 "ld1h z15.h, p1/z, [x8, #-3, MUL VL]\n"
900 "fmax z10.h, p2/m, z10.h, z1.h\n"
901 "ld1h z8.h, p2/z, [%[outptr4], #2, MUL VL]\n"
902 "fmin z11.h, p0/m, z11.h, z0.h\n"
903 "st1h z16.h, p0, [%[outptr2]]\n"
904 "fadd z13.h, z13.h, z5.h\n"
905 "ld1h z16.h, p2/z, [x8, #-2, MUL VL]\n"
906 "fmin z12.h, p1/m, z12.h, z0.h\n"
907 "ld1h z9.h, p0/z, [%[outptr5]]\n"
908 "fadd z14.h, z14.h, z6.h\n"
909 "st1h z17.h, p1, [%[outptr2], #1, MUL VL]\n"
910 "fmax z11.h, p0/m, z11.h, z1.h\n"
911 "ld1h z17.h, p0/z, [x8, #-1, MUL VL]\n"
912 "fmin z13.h, p2/m, z13.h, z0.h\n"
913 "ld1h z2.h, p1/z, [%[outptr5], #1, MUL VL]\n"
914 "fmax z12.h, p1/m, z12.h, z1.h\n"
915 "st1h z10.h, p2, [%[outptr2], #2, MUL VL]\n"
916 "fmin z14.h, p0/m, z14.h, z0.h\n"
917 "ld1h z10.h, p1/z, [x8]\n"
918 "fadd z15.h, z15.h, z7.h\n"
919 "ld1h z3.h, p2/z, [%[outptr5], #2, MUL VL]\n"
920 "fmax z13.h, p2/m, z13.h, z1.h\n"
921 "st1h z11.h, p0, [%[outptr3]]\n"
922 "fadd z16.h, z16.h, z8.h\n"
923 "ld1h z11.h, p2/z, [x8, #1, MUL VL]\n"
924 "fmax z14.h, p0/m, z14.h, z1.h\n"
925 "ld1h z4.h, p0/z, [%[outptr6]]\n"
926 "fmin z15.h, p1/m, z15.h, z0.h\n"
927 "st1h z12.h, p1, [%[outptr3], #1, MUL VL]\n"
928 "fadd z17.h, z17.h, z9.h\n"
929 "ld1h z12.h, p0/z, [x8, #2, MUL VL]\n"
930 "fmin z16.h, p2/m, z16.h, z0.h\n"
931 "ld1h z5.h, p1/z, [%[outptr6], #1, MUL VL]\n"
932 "fadd z10.h, z10.h, z2.h\n"
933 "st1h z13.h, p2, [%[outptr3], #2, MUL VL]\n"
934 "fmax z15.h, p1/m, z15.h, z1.h\n"
935 "ld1h z13.h, p1/z, [x8, #3, MUL VL]\n"
936 "fmin z17.h, p0/m, z17.h, z0.h\n"
937 "ld1h z6.h, p2/z, [%[outptr6], #2, MUL VL]\n"
938 "fmax z16.h, p2/m, z16.h, z1.h\n"
939 "st1h z14.h, p0, [%[outptr4]]\n"
940 "fmin z10.h, p1/m, z10.h, z0.h\n"
941 "ld1h z14.h, p2/z, [x8, #4, MUL VL]\n"
942 "fadd z11.h, z11.h, z3.h\n"
943 "ld1h z7.h, p0/z, [%[outptr7]]\n"
944 "fmax z17.h, p0/m, z17.h, z1.h\n"
945 "st1h z15.h, p1, [%[outptr4], #1, MUL VL]\n"
946 "fadd z12.h, z12.h, z4.h\n"
947 "ld1h z15.h, p0/z, [x8, #5, MUL VL]\n"
948 "fmax z10.h, p1/m, z10.h, z1.h\n"
949 "ld1h z8.h, p1/z, [%[outptr7], #1, MUL VL]\n"
950 "fmin z11.h, p2/m, z11.h, z0.h\n"
951 "st1h z16.h, p2, [%[outptr4], #2, MUL VL]\n"
952 "fadd z13.h, z13.h, z5.h\n"
953 "ld1h z16.h, p1/z, [x8, #6, MUL VL]\n"
954 "fmin z12.h, p0/m, z12.h, z0.h\n"
955 "ld1h z9.h, p2/z, [%[outptr7], #2, MUL VL]\n"
956 "fadd z14.h, z14.h, z6.h\n"
957 "st1h z17.h, p0, [%[outptr5]]\n"
958 "fmax z11.h, p2/m, z11.h, z1.h\n"
959 "ld1h z17.h, p2/z, [x8, #7, MUL VL]\n"
960 "fmin z13.h, p1/m, z13.h, z0.h\n"
961 "addvl %[outptr0], %[outptr0], #3\n"
962 "fmax z12.h, p0/m, z12.h, z1.h\n"
963 "st1h z10.h, p1, [%[outptr5], #1, MUL VL]\n"
964 "fmin z14.h, p2/m, z14.h, z0.h\n"
965 "addvl %[outptr1], %[outptr1], #3\n"
966 "fmax z13.h, p1/m, z13.h, z1.h\n"
967 "st1h z11.h, p2, [%[outptr5], #2, MUL VL]\n"
968 "fadd z15.h, z15.h, z7.h\n"
969 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
970 "fmax z14.h, p2/m, z14.h, z1.h\n"
971 "st1h z12.h, p0, [%[outptr6]]\n"
972 "fadd z16.h, z16.h, z8.h\n"
973 "addvl %[outptr2], %[outptr2], #3\n"
974 "fmin z15.h, p0/m, z15.h, z0.h\n"
975 "st1h z13.h, p1, [%[outptr6], #1, MUL VL]\n"
976 "fadd z17.h, z17.h, z9.h\n"
977 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
978 "fmin z16.h, p1/m, z16.h, z0.h\n"
979 "st1h z14.h, p2, [%[outptr6], #2, MUL VL]\n"
980 "fmax z15.h, p0/m, z15.h, z1.h\n"
981 "addvl %[outptr3], %[outptr3], #3\n"
982 "fmin z17.h, p2/m, z17.h, z0.h\n"
983 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
984 "fmax z16.h, p1/m, z16.h, z1.h\n"
985 "st1h z15.h, p0, [%[outptr7]]\n"
986 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
987 "fmax z17.h, p2/m, z17.h, z1.h\n"
988 "addvl %[outptr4], %[outptr4], #3\n"
989 "st1h z16.h, p1, [%[outptr7], #1, MUL VL]\n"
990 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
991 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
992 "addvl %[outptr5], %[outptr5], #3\n"
993 "st1h z17.h, p2, [%[outptr7], #2, MUL VL]\n"
994 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
995 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
996 "addvl %[outptr6], %[outptr6], #3\n"
997 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
998 "addvl %[outptr7], %[outptr7], #3\n"
999 "addvl %[inptr], %[inptr], #24\n" /* step past all 8 rows x 3 input vectors */
1000 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1001 [inptr] "+r" (inptr), [p] "+r" (p)
1002 : [w] "r" (w), [minval] "w" (minval), [maxval] "w" (maxval)
1003 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1004 );
1005 }
1006 break;
1007
1008
1009 }
1010 }
1011 else
1012 {
Georgios Pinitas5aa1a0b2020-07-02 20:02:20 +01001013 const __fp16 *biasptr = bias ? bias + i : nullbias;
Georgios Pinitas48b3ef82019-10-14 19:03:09 +01001014
1015 switch(height)
1016 {
1017 case 1:
1018 {
1019 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1020 long p = 0; /* running lane index; advanced by inch between predicates */
1021 /* Bias-add-and-clamp for 1 row x 3 vectors of __fp16 (not a plain copy):
      * input vectors 0-2 are read from inptr, the three bias vectors from
      * biasptr (z2/z3/z4) are added, the sums are clamped with fmin against
      * maxval and fmax against minval, and stored to outptr0 under p0/p1/p2.
      * outptr0 advances by 3 vectors; inptr still advances by 24 vectors
      * even though only the first 3 are consumed.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1022 __asm __volatile (
1023 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1024 "addvl x8, %[inptr], #16\n" /* x8 is computed but not referenced in this case */
1025 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1026 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1027 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1028 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1029 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1030 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1031 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1032 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1033 "inch %[p], all, mul #1\n"
1034 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1035 "ld1h z13.h, p0/z, [%[inptr]]\n"
1036 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1037 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1038 "fadd z13.h, z13.h, z2.h\n"
1039 "fadd z14.h, z14.h, z3.h\n"
1040 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1041 "addvl %[inptr], %[inptr], #24\n"
1042 "fmin z13.h, p0/m, z13.h, z0.h\n"
1043 "fmin z14.h, p1/m, z14.h, z0.h\n"
1044 "fadd z15.h, z15.h, z4.h\n"
1045 "fmax z13.h, p0/m, z13.h, z1.h\n"
1046 "fmax z14.h, p1/m, z14.h, z1.h\n"
1047 "fmin z15.h, p2/m, z15.h, z0.h\n"
1048 "st1h z13.h, p0, [%[outptr0]]\n"
1049 "fmax z15.h, p2/m, z15.h, z1.h\n"
1050 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1051 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1052 "addvl %[outptr0], %[outptr0], #3\n"
1053 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1054 [inptr] "+r" (inptr), [p] "+r" (p)
1055 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1056 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1057 );
1058 }
1059 break;
1060
1061 case 2:
1062 {
1063 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1064 long p = 0; /* running lane index; advanced by inch between predicates */
1065 /* Bias-add-and-clamp for 2 rows x 3 vectors of __fp16 (not a plain copy):
      * the input vector for row r, column-vector c is read from
      * inptr + (3*r + c) vectors, bias vector c (z2/z3/z4, from biasptr) is
      * added, the sum is clamped with fmin against maxval and fmax against
      * minval, and stored to outptrR + c vectors under p0/p1/p2.
      * outptr0-1 advance by 3 vectors; inptr advances by 24 vectors.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1066 __asm __volatile (
1067 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1068 "addvl x8, %[inptr], #16\n" /* x8 is computed but not referenced in this case */
1069 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1070 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1071 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1072 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1073 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1074 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1075 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1076 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1077 "inch %[p], all, mul #1\n"
1078 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1079 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1080 "ld1h z13.h, p0/z, [%[inptr]]\n"
1081 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1082 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1083 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1084 "fadd z13.h, z13.h, z2.h\n"
1085 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1086 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1087 "fadd z14.h, z14.h, z3.h\n"
1088 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1089 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1090 "addvl %[inptr], %[inptr], #24\n"
1091 "fmin z13.h, p0/m, z13.h, z0.h\n"
1092 "fmin z14.h, p1/m, z14.h, z0.h\n"
1093 "fadd z15.h, z15.h, z4.h\n"
1094 "fadd z16.h, z16.h, z2.h\n"
1095 "fmax z13.h, p0/m, z13.h, z1.h\n"
1096 "fmax z14.h, p1/m, z14.h, z1.h\n"
1097 "fmin z15.h, p2/m, z15.h, z0.h\n"
1098 "fmin z16.h, p0/m, z16.h, z0.h\n"
1099 "st1h z13.h, p0, [%[outptr0]]\n"
1100 "fadd z17.h, z17.h, z3.h\n"
1101 "fadd z18.h, z18.h, z4.h\n"
1102 "fmax z15.h, p2/m, z15.h, z1.h\n"
1103 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1104 "fmax z16.h, p0/m, z16.h, z1.h\n"
1105 "fmin z17.h, p1/m, z17.h, z0.h\n"
1106 "fmin z18.h, p2/m, z18.h, z0.h\n"
1107 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1108 "addvl %[outptr0], %[outptr0], #3\n"
1109 "fmax z17.h, p1/m, z17.h, z1.h\n"
1110 "st1h z16.h, p0, [%[outptr1]]\n"
1111 "fmax z18.h, p2/m, z18.h, z1.h\n"
1112 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1113 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1114 "addvl %[outptr1], %[outptr1], #3\n"
1115 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1116 [inptr] "+r" (inptr), [p] "+r" (p)
1117 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1118 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1119 );
1120 }
1121 break;
1122
1123 case 3:
1124 {
1125 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1126 long p = 0; /* running lane index; advanced by inch between predicates */
1127 /* Bias-add-and-clamp for 3 rows x 3 vectors of __fp16 (not a plain copy):
      * the input vector for row r, column-vector c is input vector 3*r + c,
      * bias vector c (z2/z3/z4, from biasptr) is added, the sum is clamped
      * with fmin against maxval and fmax against minval, and stored to
      * outptrR + c vectors under p0/p1/p2.
      * outptr0-2 advance by 3 vectors; inptr advances by 24 vectors.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1128 __asm __volatile (
1129 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1130 "addvl x8, %[inptr], #16\n" /* x8 = original inptr + 16 vectors; stays valid after inptr is advanced */
1131 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1132 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1133 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1134 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1135 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1136 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1137 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1138 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1139 "inch %[p], all, mul #1\n"
1140 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1141 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1142 "ld1h z13.h, p0/z, [%[inptr]]\n"
1143 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1144 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1145 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1146 "fadd z13.h, z13.h, z2.h\n"
1147 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1148 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1149 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1150 "fadd z14.h, z14.h, z3.h\n"
1151 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1152 "fmin z13.h, p0/m, z13.h, z0.h\n"
1153 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1154 "fadd z15.h, z15.h, z4.h\n"
1155 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1156 "fadd z16.h, z16.h, z2.h\n"
1157 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1158 "fmin z14.h, p1/m, z14.h, z0.h\n"
1159 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1160 "fmax z13.h, p0/m, z13.h, z1.h\n"
1161 "addvl %[inptr], %[inptr], #24\n" /* inptr advanced here; the remaining input load goes through x8 */
1162 "fmax z14.h, p1/m, z14.h, z1.h\n"
1163 "fmin z15.h, p2/m, z15.h, z0.h\n"
1164 "st1h z13.h, p0, [%[outptr0]]\n"
1165 "fmin z16.h, p0/m, z16.h, z0.h\n"
1166 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n" /* input vector 8 (row 2, column-vector 2) */
1167 "fadd z17.h, z17.h, z3.h\n"
1168 "fadd z18.h, z18.h, z4.h\n"
1169 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1170 "fmax z15.h, p2/m, z15.h, z1.h\n"
1171 "fmax z16.h, p0/m, z16.h, z1.h\n"
1172 "fmin z17.h, p1/m, z17.h, z0.h\n"
1173 "fmin z18.h, p2/m, z18.h, z0.h\n"
1174 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1175 "fadd z19.h, z19.h, z2.h\n"
1176 "addvl %[outptr0], %[outptr0], #3\n"
1177 "fmax z17.h, p1/m, z17.h, z1.h\n"
1178 "st1h z16.h, p0, [%[outptr1]]\n"
1179 "fmax z18.h, p2/m, z18.h, z1.h\n"
1180 "fmin z19.h, p0/m, z19.h, z0.h\n"
1181 "fadd z20.h, z20.h, z3.h\n"
1182 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1183 "fadd z13.h, z13.h, z4.h\n"
1184 "fmax z19.h, p0/m, z19.h, z1.h\n"
1185 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1186 "fmin z20.h, p1/m, z20.h, z0.h\n"
1187 "addvl %[outptr1], %[outptr1], #3\n"
1188 "fmin z13.h, p2/m, z13.h, z0.h\n"
1189 "st1h z19.h, p0, [%[outptr2]]\n"
1190 "fmax z20.h, p1/m, z20.h, z1.h\n"
1191 "fmax z13.h, p2/m, z13.h, z1.h\n"
1192 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1193 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1194 "addvl %[outptr2], %[outptr2], #3\n"
1195 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1196 [inptr] "+r" (inptr), [p] "+r" (p)
1197 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1198 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1199 );
1200 }
1201 break;
1202
1203 case 4:
1204 {
1205 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1206 long p = 0; /* running lane index; advanced by inch between predicates */
1207 /* Bias-add-and-clamp for 4 rows x 3 vectors of __fp16 (not a plain copy):
      * the input vector for row r, column-vector c is input vector 3*r + c,
      * bias vector c (z2/z3/z4, from biasptr) is added, the sum is clamped
      * with fmin against maxval and fmax against minval, and stored to
      * outptrR + c vectors under p0/p1/p2.
      * outptr0-3 advance by 3 vectors; inptr advances by 24 vectors.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1208 __asm __volatile (
1209 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1210 "addvl x8, %[inptr], #16\n" /* x8 = original inptr + 16 vectors; stays valid after inptr is advanced */
1211 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1212 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1213 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1214 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1215 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1216 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1217 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1218 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1219 "inch %[p], all, mul #1\n"
1220 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1221 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1222 "ld1h z13.h, p0/z, [%[inptr]]\n"
1223 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1224 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1225 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1226 "fadd z13.h, z13.h, z2.h\n"
1227 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1228 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1229 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1230 "fadd z14.h, z14.h, z3.h\n"
1231 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1232 "fmin z13.h, p0/m, z13.h, z0.h\n"
1233 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1234 "fadd z15.h, z15.h, z4.h\n"
1235 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1236 "fadd z16.h, z16.h, z2.h\n"
1237 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1238 "fmin z14.h, p1/m, z14.h, z0.h\n"
1239 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1240 "fmax z13.h, p0/m, z13.h, z1.h\n"
1241 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1242 "fmax z14.h, p1/m, z14.h, z1.h\n"
1243 "addvl %[inptr], %[inptr], #24\n" /* inptr advanced here; remaining input loads go through x8 */
1244 "fmin z15.h, p2/m, z15.h, z0.h\n"
1245 "st1h z13.h, p0, [%[outptr0]]\n"
1246 "fmin z16.h, p0/m, z16.h, z0.h\n"
1247 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n" /* input vector 8 (row 2, column-vector 2) */
1248 "fadd z17.h, z17.h, z3.h\n"
1249 "fadd z18.h, z18.h, z4.h\n"
1250 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1251 "fmax z15.h, p2/m, z15.h, z1.h\n"
1252 "ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
1253 "fmax z16.h, p0/m, z16.h, z1.h\n"
1254 "fmin z17.h, p1/m, z17.h, z0.h\n"
1255 "fmin z18.h, p2/m, z18.h, z0.h\n"
1256 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1257 "fadd z19.h, z19.h, z2.h\n"
1258 "ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
1259 "fadd z20.h, z20.h, z3.h\n"
1260 "addvl %[outptr0], %[outptr0], #3\n"
1261 "fmax z17.h, p1/m, z17.h, z1.h\n"
1262 "st1h z16.h, p0, [%[outptr1]]\n"
1263 "fmax z18.h, p2/m, z18.h, z1.h\n"
1264 "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
1265 "fmin z19.h, p0/m, z19.h, z0.h\n"
1266 "fmin z20.h, p1/m, z20.h, z0.h\n"
1267 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1268 "fadd z13.h, z13.h, z4.h\n"
1269 "fadd z14.h, z14.h, z2.h\n"
1270 "fmax z19.h, p0/m, z19.h, z1.h\n"
1271 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1272 "fmax z20.h, p1/m, z20.h, z1.h\n"
1273 "addvl %[outptr1], %[outptr1], #3\n"
1274 "fmin z13.h, p2/m, z13.h, z0.h\n"
1275 "st1h z19.h, p0, [%[outptr2]]\n"
1276 "fmin z14.h, p0/m, z14.h, z0.h\n"
1277 "fadd z15.h, z15.h, z3.h\n"
1278 "fadd z16.h, z16.h, z4.h\n"
1279 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1280 "fmax z13.h, p2/m, z13.h, z1.h\n"
1281 "fmax z14.h, p0/m, z14.h, z1.h\n"
1282 "fmin z15.h, p1/m, z15.h, z0.h\n"
1283 "fmin z16.h, p2/m, z16.h, z0.h\n"
1284 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1285 "addvl %[outptr2], %[outptr2], #3\n"
1286 "fmax z15.h, p1/m, z15.h, z1.h\n"
1287 "st1h z14.h, p0, [%[outptr3]]\n"
1288 "fmax z16.h, p2/m, z16.h, z1.h\n"
1289 "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
1290 "st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
1291 "addvl %[outptr3], %[outptr3], #3\n"
1292 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1293 [inptr] "+r" (inptr), [p] "+r" (p)
1294 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1295 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1296 );
1297 }
1298 break;
1299
1300 case 5:
1301 {
1302 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1303 long p = 0; /* running lane index; advanced by inch between predicates */
1304 /* Bias-add-and-clamp for 5 rows x 3 vectors of __fp16 (not a plain copy):
      * the input vector for row r, column-vector c is input vector 3*r + c,
      * bias vector c (z2/z3/z4, from biasptr) is added, the sum is clamped
      * with fmin against maxval and fmax against minval, and stored to
      * outptrR + c vectors under p0/p1/p2.
      * outptr0-4 advance by 3 vectors; inptr advances by 24 vectors.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1305 __asm __volatile (
1306 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1307 "addvl x8, %[inptr], #16\n" /* x8 = original inptr + 16 vectors; stays valid after inptr is advanced */
1308 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1309 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1310 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1311 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1312 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1313 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1314 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1315 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1316 "inch %[p], all, mul #1\n"
1317 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1318 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1319 "ld1h z13.h, p0/z, [%[inptr]]\n"
1320 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1321 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1322 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1323 "fadd z13.h, z13.h, z2.h\n"
1324 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1325 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1326 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1327 "fadd z14.h, z14.h, z3.h\n"
1328 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1329 "fmin z13.h, p0/m, z13.h, z0.h\n"
1330 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1331 "fadd z15.h, z15.h, z4.h\n"
1332 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1333 "fadd z16.h, z16.h, z2.h\n"
1334 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1335 "fmin z14.h, p1/m, z14.h, z0.h\n"
1336 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1337 "fmax z13.h, p0/m, z13.h, z1.h\n"
1338 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1339 "fmax z14.h, p1/m, z14.h, z1.h\n"
1340 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1341 "fmin z15.h, p2/m, z15.h, z0.h\n"
1342 "st1h z13.h, p0, [%[outptr0]]\n"
1343 "fmin z16.h, p0/m, z16.h, z0.h\n"
1344 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n" /* input vector 8 (row 2, column-vector 2) */
1345 "fadd z17.h, z17.h, z3.h\n"
1346 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1347 "fmax z15.h, p2/m, z15.h, z1.h\n"
1348 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1349 "fmax z16.h, p0/m, z16.h, z1.h\n"
1350 "ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
1351 "fmin z17.h, p1/m, z17.h, z0.h\n"
1352 "addvl %[inptr], %[inptr], #24\n" /* inptr advanced here; remaining input loads go through x8 */
1353 "fadd z18.h, z18.h, z4.h\n"
1354 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1355 "fadd z19.h, z19.h, z2.h\n"
1356 "ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
1357 "fmax z17.h, p1/m, z17.h, z1.h\n"
1358 "addvl %[outptr0], %[outptr0], #3\n"
1359 "fmin z18.h, p2/m, z18.h, z0.h\n"
1360 "st1h z16.h, p0, [%[outptr1]]\n"
1361 "fmin z19.h, p0/m, z19.h, z0.h\n"
1362 "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
1363 "fadd z20.h, z20.h, z3.h\n"
1364 "fadd z13.h, z13.h, z4.h\n"
1365 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1366 "fmax z18.h, p2/m, z18.h, z1.h\n"
1367 "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
1368 "fmax z19.h, p0/m, z19.h, z1.h\n"
1369 "fmin z20.h, p1/m, z20.h, z0.h\n"
1370 "fmin z13.h, p2/m, z13.h, z0.h\n"
1371 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1372 "fadd z14.h, z14.h, z2.h\n"
1373 "ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
1374 "fadd z15.h, z15.h, z3.h\n"
1375 "addvl %[outptr1], %[outptr1], #3\n"
1376 "fmax z20.h, p1/m, z20.h, z1.h\n"
1377 "st1h z19.h, p0, [%[outptr2]]\n"
1378 "fmax z13.h, p2/m, z13.h, z1.h\n"
1379 "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
1380 "fmin z14.h, p0/m, z14.h, z0.h\n"
1381 "fmin z15.h, p1/m, z15.h, z0.h\n"
1382 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1383 "fadd z16.h, z16.h, z4.h\n"
1384 "fadd z17.h, z17.h, z2.h\n"
1385 "fmax z14.h, p0/m, z14.h, z1.h\n"
1386 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1387 "fmax z15.h, p1/m, z15.h, z1.h\n"
1388 "addvl %[outptr2], %[outptr2], #3\n"
1389 "fmin z16.h, p2/m, z16.h, z0.h\n"
1390 "st1h z14.h, p0, [%[outptr3]]\n"
1391 "fmin z17.h, p0/m, z17.h, z0.h\n"
1392 "fadd z18.h, z18.h, z3.h\n"
1393 "fadd z19.h, z19.h, z4.h\n"
1394 "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
1395 "fmax z16.h, p2/m, z16.h, z1.h\n"
1396 "fmax z17.h, p0/m, z17.h, z1.h\n"
1397 "fmin z18.h, p1/m, z18.h, z0.h\n"
1398 "fmin z19.h, p2/m, z19.h, z0.h\n"
1399 "st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
1400 "addvl %[outptr3], %[outptr3], #3\n"
1401 "fmax z18.h, p1/m, z18.h, z1.h\n"
1402 "st1h z17.h, p0, [%[outptr4]]\n"
1403 "fmax z19.h, p2/m, z19.h, z1.h\n"
1404 "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
1405 "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
1406 "addvl %[outptr4], %[outptr4], #3\n"
1407 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1408 [inptr] "+r" (inptr), [p] "+r" (p)
1409 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1410 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1411 );
1412 }
1413 break;
1414
1415 case 6:
1416 {
1417 long w = xmax - i; /* bound compared against p by the whilelt predicates below */
1418 long p = 0; /* running lane index; advanced by inch between predicates */
1419 /* Bias-add-and-clamp for 6 rows x 3 vectors of __fp16 (not a plain copy):
      * the input vector for row r, column-vector c is input vector 3*r + c,
      * bias vector c (z2/z3/z4, from biasptr) is added, the sum is clamped
      * with fmin against maxval and fmax against minval, and stored to
      * outptrR + c vectors under p0/p1/p2.
      * outptr0-5 advance by 3 vectors; inptr advances by 24 vectors.
      * NOTE(review): all three bias vectors are loaded under p0 rather than
      * p0/p1/p2, so the 2nd and 3rd loads may read bias elements at or beyond
      * offset w - confirm the bias buffer tolerates this. */
1420 __asm __volatile (
1421 "mov z0.h, %h[maxval]\n" /* z0 = maxval in every halfword lane (upper clamp) */
1422 "addvl x8, %[inptr], #16\n" /* x8 = original inptr + 16 vectors; stays valid after inptr is advanced */
1423 "mov z1.h, %h[minval]\n" /* z1 = minval in every halfword lane (lower clamp) */
1424 "whilelt p0.h, %[p], %[w]\n" /* p0: lanes of column-vector 0 that lie below w */
1425 "inch %[p], all, mul #1\n" /* p += halfword lanes per vector */
1426 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1427 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1428 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1429 "whilelt p1.h, %[p], %[w]\n" /* p1: lanes of column-vector 1 that lie below w */
1430 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1431 "inch %[p], all, mul #1\n"
1432 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1433 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1434 "ld1h z13.h, p0/z, [%[inptr]]\n"
1435 "whilelt p2.h, %[p], %[w]\n" /* p2: lanes of column-vector 2 that lie below w */
1436 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1437 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1438 "fadd z13.h, z13.h, z2.h\n"
1439 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1440 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1441 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1442 "fadd z14.h, z14.h, z3.h\n"
1443 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1444 "fmin z13.h, p0/m, z13.h, z0.h\n"
1445 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1446 "fadd z15.h, z15.h, z4.h\n"
1447 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1448 "fadd z16.h, z16.h, z2.h\n"
1449 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1450 "fmin z14.h, p1/m, z14.h, z0.h\n"
1451 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1452 "fmax z13.h, p0/m, z13.h, z1.h\n"
1453 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1454 "fmax z14.h, p1/m, z14.h, z1.h\n"
1455 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1456 "fmin z15.h, p2/m, z15.h, z0.h\n"
1457 "st1h z13.h, p0, [%[outptr0]]\n"
1458 "fmin z16.h, p0/m, z16.h, z0.h\n"
1459 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n" /* input vector 8 (row 2, column-vector 2) */
1460 "fadd z17.h, z17.h, z3.h\n"
1461 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1462 "fmax z15.h, p2/m, z15.h, z1.h\n"
1463 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1464 "fmax z16.h, p0/m, z16.h, z1.h\n"
1465 "ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
1466 "fmin z17.h, p1/m, z17.h, z0.h\n"
1467 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1468 "fadd z18.h, z18.h, z4.h\n"
1469 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1470 "fadd z19.h, z19.h, z2.h\n"
1471 "ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
1472 "fmax z17.h, p1/m, z17.h, z1.h\n"
1473 "addvl %[outptr0], %[outptr0], #3\n"
1474 "fmin z18.h, p2/m, z18.h, z0.h\n"
1475 "st1h z16.h, p0, [%[outptr1]]\n"
1476 "fmin z19.h, p0/m, z19.h, z0.h\n"
1477 "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
1478 "fadd z20.h, z20.h, z3.h\n"
1479 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1480 "fmax z18.h, p2/m, z18.h, z1.h\n"
1481 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1482 "fmax z19.h, p0/m, z19.h, z1.h\n"
1483 "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
1484 "fmin z20.h, p1/m, z20.h, z0.h\n"
1485 "addvl %[inptr], %[inptr], #24\n" /* inptr advanced here; remaining input loads go through x8 */
1486 "fadd z13.h, z13.h, z4.h\n"
1487 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1488 "fadd z14.h, z14.h, z2.h\n"
1489 "ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
1490 "fmax z20.h, p1/m, z20.h, z1.h\n"
1491 "addvl %[outptr1], %[outptr1], #3\n"
1492 "fmin z13.h, p2/m, z13.h, z0.h\n"
1493 "st1h z19.h, p0, [%[outptr2]]\n"
1494 "fmin z14.h, p0/m, z14.h, z0.h\n"
1495 "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
1496 "fadd z15.h, z15.h, z3.h\n"
1497 "fadd z16.h, z16.h, z4.h\n"
1498 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1499 "fmax z13.h, p2/m, z13.h, z1.h\n"
1500 "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
1501 "fmax z14.h, p0/m, z14.h, z1.h\n"
1502 "fmin z15.h, p1/m, z15.h, z0.h\n"
1503 "fmin z16.h, p2/m, z16.h, z0.h\n"
1504 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1505 "fadd z17.h, z17.h, z2.h\n"
1506 "ld1h z13.h, p1/z, [x8]\n"
1507 "fadd z18.h, z18.h, z3.h\n"
1508 "addvl %[outptr2], %[outptr2], #3\n"
1509 "fmax z15.h, p1/m, z15.h, z1.h\n"
1510 "st1h z14.h, p0, [%[outptr3]]\n"
1511 "fmax z16.h, p2/m, z16.h, z1.h\n"
1512 "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
1513 "fmin z17.h, p0/m, z17.h, z0.h\n"
1514 "fmin z18.h, p1/m, z18.h, z0.h\n"
1515 "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
1516 "fadd z19.h, z19.h, z4.h\n"
1517 "fadd z20.h, z20.h, z2.h\n"
1518 "fmax z17.h, p0/m, z17.h, z1.h\n"
1519 "st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
1520 "fmax z18.h, p1/m, z18.h, z1.h\n"
1521 "addvl %[outptr3], %[outptr3], #3\n"
1522 "fmin z19.h, p2/m, z19.h, z0.h\n"
1523 "st1h z17.h, p0, [%[outptr4]]\n"
1524 "fmin z20.h, p0/m, z20.h, z0.h\n"
1525 "fadd z13.h, z13.h, z3.h\n"
1526 "fadd z14.h, z14.h, z4.h\n"
1527 "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
1528 "fmax z19.h, p2/m, z19.h, z1.h\n"
1529 "fmax z20.h, p0/m, z20.h, z1.h\n"
1530 "fmin z13.h, p1/m, z13.h, z0.h\n"
1531 "fmin z14.h, p2/m, z14.h, z0.h\n"
1532 "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
1533 "addvl %[outptr4], %[outptr4], #3\n"
1534 "fmax z13.h, p1/m, z13.h, z1.h\n"
1535 "st1h z20.h, p0, [%[outptr5]]\n"
1536 "fmax z14.h, p2/m, z14.h, z1.h\n"
1537 "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
1538 "st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
1539 "addvl %[outptr5], %[outptr5], #3\n"
1540 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1541 [inptr] "+r" (inptr), [p] "+r" (p)
1542 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1543 : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1544 );
1545 }
1546 break;
1547
1548 case 7:
// Seven-row variant. Each result vector is computed as
//   min(input + bias, maxval) followed by max(..., minval)
// and stored through outptr0..outptr6, three vectors per row.
// outptr7 is bound as an operand below but is not referenced by this asm body.
1549 {
// Upper bound given to the WHILELT instructions. `i` is declared outside this
// excerpt -- presumably the first column handled by this iteration; confirm
// against the enclosing loop.
1550 long w = xmax - i;
// Running element index from which the three column predicates are derived.
1551 long p = 0;
1552 /* Optimized routine: add bias, clamp and store an entire block of 7 rows */
1553 __asm __volatile (
// Register roles in this asm body:
//   z0 / z1   : maxval / minval broadcast to every halfword lane
//   z2..z4    : the three bias vectors read from biasptr
//   p0..p2    : lane predicates for the 1st / 2nd / 3rd vector of each row
//   z13..z20  : rotating data registers (load -> fadd -> fmin -> fmax -> store)
//   x8        : inptr + 16 vector lengths, used as a second base address
1554 "mov z0.h, %h[maxval]\n"
// The loads below use [x8, #-8 .. #4, MUL VL], i.e. input vectors 8..20;
// vectors 0..7 are addressed from inptr directly.
1555 "addvl x8, %[inptr], #16\n"
1556 "mov z1.h, %h[minval]\n"
// p0 is active for lanes whose index (p + lane) is below w; INCH then advances
// p by one vector of halfwords before the next predicate is built.
1557 "whilelt p0.h, %[p], %[w]\n"
1558 "inch %[p], all, mul #1\n"
1559 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1560 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
// NOTE(review): all three bias vectors are loaded under p0. When p0 is
// all-true, the loads at #1 and #2 MUL VL are not limited by w -- presumably
// the bias buffer is readable for three full vectors; confirm with the caller.
1561 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1562 "whilelt p1.h, %[p], %[w]\n"
1563 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1564 "inch %[p], all, mul #1\n"
1565 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1566 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1567 "ld1h z13.h, p0/z, [%[inptr]]\n"
1568 "whilelt p2.h, %[p], %[w]\n"
1569 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1570 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
// From here on, input vector k is combined with bias z(2 + k % 3) and
// processed under predicate p(k % 3).
1571 "fadd z13.h, z13.h, z2.h\n"
1572 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1573 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1574 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1575 "fadd z14.h, z14.h, z3.h\n"
1576 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1577 "fmin z13.h, p0/m, z13.h, z0.h\n"
1578 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1579 "fadd z15.h, z15.h, z4.h\n"
1580 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1581 "fadd z16.h, z16.h, z2.h\n"
1582 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1583 "fmin z14.h, p1/m, z14.h, z0.h\n"
1584 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1585 "fmax z13.h, p0/m, z13.h, z1.h\n"
1586 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1587 "fmax z14.h, p1/m, z14.h, z1.h\n"
1588 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1589 "fmin z15.h, p2/m, z15.h, z0.h\n"
1590 "st1h z13.h, p0, [%[outptr0]]\n"
1591 "fmin z16.h, p0/m, z16.h, z0.h\n"
1592 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
1593 "fadd z17.h, z17.h, z3.h\n"
1594 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1595 "fmax z15.h, p2/m, z15.h, z1.h\n"
1596 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1597 "fmax z16.h, p0/m, z16.h, z1.h\n"
1598 "ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
1599 "fmin z17.h, p1/m, z17.h, z0.h\n"
1600 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1601 "fadd z18.h, z18.h, z4.h\n"
1602 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1603 "fadd z19.h, z19.h, z2.h\n"
1604 "ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
1605 "fmax z17.h, p1/m, z17.h, z1.h\n"
// Each output pointer is advanced by three vectors once its row is stored.
1606 "addvl %[outptr0], %[outptr0], #3\n"
1607 "fmin z18.h, p2/m, z18.h, z0.h\n"
1608 "st1h z16.h, p0, [%[outptr1]]\n"
1609 "fmin z19.h, p0/m, z19.h, z0.h\n"
1610 "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
1611 "fadd z20.h, z20.h, z3.h\n"
1612 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1613 "fmax z18.h, p2/m, z18.h, z1.h\n"
1614 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1615 "fmax z19.h, p0/m, z19.h, z1.h\n"
1616 "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
1617 "fmin z20.h, p1/m, z20.h, z0.h\n"
1618 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1619 "fadd z13.h, z13.h, z4.h\n"
1620 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1621 "fadd z14.h, z14.h, z2.h\n"
1622 "ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
1623 "fmax z20.h, p1/m, z20.h, z1.h\n"
1624 "addvl %[outptr1], %[outptr1], #3\n"
1625 "fmin z13.h, p2/m, z13.h, z0.h\n"
1626 "st1h z19.h, p0, [%[outptr2]]\n"
1627 "fmin z14.h, p0/m, z14.h, z0.h\n"
1628 "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
1629 "fadd z15.h, z15.h, z3.h\n"
1630 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1631 "fmax z13.h, p2/m, z13.h, z1.h\n"
1632 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1633 "fmax z14.h, p0/m, z14.h, z1.h\n"
1634 "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
1635 "fmin z15.h, p1/m, z15.h, z0.h\n"
// inptr moves on by 24 vectors although only 21 are consumed by this variant --
// presumably the 3 x 8 block size of the MergeResults<3, 8, true> specialisation.
// The remaining loads go through x8, which is unaffected by this update.
1636 "addvl %[inptr], %[inptr], #24\n"
1637 "fadd z16.h, z16.h, z4.h\n"
1638 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1639 "fadd z17.h, z17.h, z2.h\n"
1640 "ld1h z13.h, p1/z, [x8]\n"
1641 "fmax z15.h, p1/m, z15.h, z1.h\n"
1642 "addvl %[outptr2], %[outptr2], #3\n"
1643 "fmin z16.h, p2/m, z16.h, z0.h\n"
1644 "st1h z14.h, p0, [%[outptr3]]\n"
1645 "fmin z17.h, p0/m, z17.h, z0.h\n"
1646 "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
1647 "fadd z18.h, z18.h, z3.h\n"
1648 "fadd z19.h, z19.h, z4.h\n"
1649 "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
1650 "fmax z16.h, p2/m, z16.h, z1.h\n"
1651 "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
1652 "fmax z17.h, p0/m, z17.h, z1.h\n"
1653 "fmin z18.h, p1/m, z18.h, z0.h\n"
1654 "fmin z19.h, p2/m, z19.h, z0.h\n"
1655 "st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
1656 "fadd z20.h, z20.h, z2.h\n"
1657 "ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
1658 "fadd z13.h, z13.h, z3.h\n"
1659 "addvl %[outptr3], %[outptr3], #3\n"
1660 "fmax z18.h, p1/m, z18.h, z1.h\n"
1661 "st1h z17.h, p0, [%[outptr4]]\n"
1662 "fmax z19.h, p2/m, z19.h, z1.h\n"
1663 "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
1664 "fmin z20.h, p0/m, z20.h, z0.h\n"
1665 "fmin z13.h, p1/m, z13.h, z0.h\n"
1666 "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
1667 "fadd z14.h, z14.h, z4.h\n"
1668 "fadd z15.h, z15.h, z2.h\n"
1669 "fmax z20.h, p0/m, z20.h, z1.h\n"
1670 "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
1671 "fmax z13.h, p1/m, z13.h, z1.h\n"
1672 "addvl %[outptr4], %[outptr4], #3\n"
1673 "fmin z14.h, p2/m, z14.h, z0.h\n"
1674 "st1h z20.h, p0, [%[outptr5]]\n"
1675 "fmin z15.h, p0/m, z15.h, z0.h\n"
1676 "fadd z16.h, z16.h, z3.h\n"
1677 "fadd z17.h, z17.h, z4.h\n"
1678 "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
1679 "fmax z14.h, p2/m, z14.h, z1.h\n"
1680 "fmax z15.h, p0/m, z15.h, z1.h\n"
1681 "fmin z16.h, p1/m, z16.h, z0.h\n"
1682 "fmin z17.h, p2/m, z17.h, z0.h\n"
1683 "st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
1684 "addvl %[outptr5], %[outptr5], #3\n"
1685 "fmax z16.h, p1/m, z16.h, z1.h\n"
1686 "st1h z15.h, p0, [%[outptr6]]\n"
1687 "fmax z17.h, p2/m, z17.h, z1.h\n"
1688 "st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
1689 "st1h z17.h, p2, [%[outptr6], #2, MUL VL]\n"
1690 "addvl %[outptr6], %[outptr6], #3\n"
// Read-write operands: the output row pointers, the input pointer and the
// element counter are all updated inside the asm.
1691 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1692 [inptr] "+r" (inptr), [p] "+r" (p)
1693 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
// p0-p2 are written by the WHILELT instructions above, so they are declared
// as clobbered; without this the compiler may assume a predicate it keeps in
// one of them survives the asm statement.
1694 : "x8", "p0", "p1", "p2", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1695 );
1696 }
1697 break;
1698
1699 default:
1700 case 8:
// Eight-row variant; `default:` shares this arm, so any switch value without
// its own case label also runs this code. Each result vector is computed as
//   min(input + bias, maxval) followed by max(..., minval)
// and stored through outptr0..outptr7, three vectors per row.
1701 {
// Upper bound given to the WHILELT instructions. `i` is declared outside this
// excerpt -- presumably the first column handled by this iteration; confirm
// against the enclosing loop.
1702 long w = xmax - i;
// Running element index from which the three column predicates are derived.
1703 long p = 0;
1704 /* Optimized routine: add bias, clamp and store an entire block of 8 rows */
1705 __asm __volatile (
// Register roles in this asm body:
//   z0 / z1   : maxval / minval broadcast to every halfword lane
//   z2..z4    : the three bias vectors read from biasptr
//   p0..p2    : lane predicates for the 1st / 2nd / 3rd vector of each row
//   z13..z20  : rotating data registers (load -> fadd -> fmin -> fmax -> store)
//   x8        : inptr + 16 vector lengths, used as a second base address
1706 "mov z0.h, %h[maxval]\n"
// The loads below use [x8, #-8 .. #7, MUL VL], i.e. input vectors 8..23;
// vectors 0..7 are addressed from inptr directly.
1707 "addvl x8, %[inptr], #16\n"
1708 "mov z1.h, %h[minval]\n"
// p0 is active for lanes whose index (p + lane) is below w; INCH then advances
// p by one vector of halfwords before the next predicate is built.
1709 "whilelt p0.h, %[p], %[w]\n"
1710 "inch %[p], all, mul #1\n"
1711 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1712 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
// NOTE(review): all three bias vectors are loaded under p0. When p0 is
// all-true, the loads at #1 and #2 MUL VL are not limited by w -- presumably
// the bias buffer is readable for three full vectors; confirm with the caller.
1713 "ld1h z2.h, p0/z, [%[biasptr]]\n"
1714 "whilelt p1.h, %[p], %[w]\n"
1715 "ld1h z3.h, p0/z, [%[biasptr], #1, MUL VL]\n"
1716 "inch %[p], all, mul #1\n"
1717 "ld1h z4.h, p0/z, [%[biasptr], #2, MUL VL]\n"
1718 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1719 "ld1h z13.h, p0/z, [%[inptr]]\n"
1720 "whilelt p2.h, %[p], %[w]\n"
1721 "ld1h z14.h, p1/z, [%[inptr], #1, MUL VL]\n"
1722 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
// From here on, input vector k is combined with bias z(2 + k % 3) and
// processed under predicate p(k % 3).
1723 "fadd z13.h, z13.h, z2.h\n"
1724 "ld1h z15.h, p2/z, [%[inptr], #2, MUL VL]\n"
1725 "ld1h z16.h, p0/z, [%[inptr], #3, MUL VL]\n"
1726 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1727 "fadd z14.h, z14.h, z3.h\n"
1728 "ld1h z17.h, p1/z, [%[inptr], #4, MUL VL]\n"
1729 "fmin z13.h, p0/m, z13.h, z0.h\n"
1730 "ld1h z18.h, p2/z, [%[inptr], #5, MUL VL]\n"
1731 "fadd z15.h, z15.h, z4.h\n"
1732 "ld1h z19.h, p0/z, [%[inptr], #6, MUL VL]\n"
1733 "fadd z16.h, z16.h, z2.h\n"
1734 "ld1h z20.h, p1/z, [%[inptr], #7, MUL VL]\n"
1735 "fmin z14.h, p1/m, z14.h, z0.h\n"
1736 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1737 "fmax z13.h, p0/m, z13.h, z1.h\n"
1738 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1739 "fmax z14.h, p1/m, z14.h, z1.h\n"
1740 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1741 "fmin z15.h, p2/m, z15.h, z0.h\n"
1742 "st1h z13.h, p0, [%[outptr0]]\n"
1743 "fmin z16.h, p0/m, z16.h, z0.h\n"
1744 "ld1h z13.h, p2/z, [x8, #-8, MUL VL]\n"
1745 "fadd z17.h, z17.h, z3.h\n"
1746 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1747 "fmax z15.h, p2/m, z15.h, z1.h\n"
1748 "st1h z14.h, p1, [%[outptr0], #1, MUL VL]\n"
1749 "fmax z16.h, p0/m, z16.h, z1.h\n"
1750 "ld1h z14.h, p0/z, [x8, #-7, MUL VL]\n"
1751 "fmin z17.h, p1/m, z17.h, z0.h\n"
1752 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1753 "fadd z18.h, z18.h, z4.h\n"
1754 "st1h z15.h, p2, [%[outptr0], #2, MUL VL]\n"
1755 "fadd z19.h, z19.h, z2.h\n"
1756 "ld1h z15.h, p1/z, [x8, #-6, MUL VL]\n"
1757 "fmax z17.h, p1/m, z17.h, z1.h\n"
// Each output pointer is advanced by three vectors once its row is stored.
1758 "addvl %[outptr0], %[outptr0], #3\n"
1759 "fmin z18.h, p2/m, z18.h, z0.h\n"
1760 "st1h z16.h, p0, [%[outptr1]]\n"
1761 "fmin z19.h, p0/m, z19.h, z0.h\n"
1762 "ld1h z16.h, p2/z, [x8, #-5, MUL VL]\n"
1763 "fadd z20.h, z20.h, z3.h\n"
1764 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1765 "fmax z18.h, p2/m, z18.h, z1.h\n"
1766 "st1h z17.h, p1, [%[outptr1], #1, MUL VL]\n"
1767 "fmax z19.h, p0/m, z19.h, z1.h\n"
1768 "ld1h z17.h, p0/z, [x8, #-4, MUL VL]\n"
1769 "fmin z20.h, p1/m, z20.h, z0.h\n"
1770 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1771 "fadd z13.h, z13.h, z4.h\n"
1772 "st1h z18.h, p2, [%[outptr1], #2, MUL VL]\n"
1773 "fadd z14.h, z14.h, z2.h\n"
1774 "ld1h z18.h, p1/z, [x8, #-3, MUL VL]\n"
1775 "fmax z20.h, p1/m, z20.h, z1.h\n"
1776 "addvl %[outptr1], %[outptr1], #3\n"
1777 "fmin z13.h, p2/m, z13.h, z0.h\n"
1778 "st1h z19.h, p0, [%[outptr2]]\n"
1779 "fmin z14.h, p0/m, z14.h, z0.h\n"
1780 "ld1h z19.h, p2/z, [x8, #-2, MUL VL]\n"
1781 "fadd z15.h, z15.h, z3.h\n"
1782 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1783 "fmax z13.h, p2/m, z13.h, z1.h\n"
1784 "st1h z20.h, p1, [%[outptr2], #1, MUL VL]\n"
1785 "fmax z14.h, p0/m, z14.h, z1.h\n"
1786 "ld1h z20.h, p0/z, [x8, #-1, MUL VL]\n"
1787 "fmin z15.h, p1/m, z15.h, z0.h\n"
1788 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1789 "fadd z16.h, z16.h, z4.h\n"
1790 "st1h z13.h, p2, [%[outptr2], #2, MUL VL]\n"
1791 "fadd z17.h, z17.h, z2.h\n"
1792 "ld1h z13.h, p1/z, [x8]\n"
1793 "fmax z15.h, p1/m, z15.h, z1.h\n"
1794 "addvl %[outptr2], %[outptr2], #3\n"
1795 "fmin z16.h, p2/m, z16.h, z0.h\n"
1796 "st1h z14.h, p0, [%[outptr3]]\n"
1797 "fmin z17.h, p0/m, z17.h, z0.h\n"
1798 "ld1h z14.h, p2/z, [x8, #1, MUL VL]\n"
1799 "fadd z18.h, z18.h, z3.h\n"
// inptr moves on by 24 vectors, matching the 24 vectors consumed here.
// The remaining loads go through x8, which is unaffected by this update.
1800 "addvl %[inptr], %[inptr], #24\n"
1801 "fmax z16.h, p2/m, z16.h, z1.h\n"
1802 "st1h z15.h, p1, [%[outptr3], #1, MUL VL]\n"
1803 "fmax z17.h, p0/m, z17.h, z1.h\n"
1804 "ld1h z15.h, p0/z, [x8, #2, MUL VL]\n"
1805 "fmin z18.h, p1/m, z18.h, z0.h\n"
1806 "fadd z19.h, z19.h, z4.h\n"
1807 "st1h z16.h, p2, [%[outptr3], #2, MUL VL]\n"
1808 "fadd z20.h, z20.h, z2.h\n"
1809 "ld1h z16.h, p1/z, [x8, #3, MUL VL]\n"
1810 "fadd z13.h, z13.h, z3.h\n"
1811 "addvl %[outptr3], %[outptr3], #3\n"
1812 "fmax z18.h, p1/m, z18.h, z1.h\n"
1813 "st1h z17.h, p0, [%[outptr4]]\n"
1814 "fmin z19.h, p2/m, z19.h, z0.h\n"
1815 "ld1h z17.h, p2/z, [x8, #4, MUL VL]\n"
1816 "fmin z20.h, p0/m, z20.h, z0.h\n"
1817 "fmin z13.h, p1/m, z13.h, z0.h\n"
1818 "st1h z18.h, p1, [%[outptr4], #1, MUL VL]\n"
1819 "fadd z14.h, z14.h, z4.h\n"
1820 "ld1h z18.h, p0/z, [x8, #5, MUL VL]\n"
1821 "fmax z19.h, p2/m, z19.h, z1.h\n"
1822 "fmax z20.h, p0/m, z20.h, z1.h\n"
1823 "fmax z13.h, p1/m, z13.h, z1.h\n"
1824 "fmin z14.h, p2/m, z14.h, z0.h\n"
1825 "st1h z19.h, p2, [%[outptr4], #2, MUL VL]\n"
1826 "fadd z15.h, z15.h, z2.h\n"
1827 "ld1h z19.h, p1/z, [x8, #6, MUL VL]\n"
1828 "fadd z16.h, z16.h, z3.h\n"
1829 "addvl %[outptr4], %[outptr4], #3\n"
1830 "fmax z14.h, p2/m, z14.h, z1.h\n"
1831 "st1h z20.h, p0, [%[outptr5]]\n"
1832 "fmin z15.h, p0/m, z15.h, z0.h\n"
1833 "ld1h z20.h, p2/z, [x8, #7, MUL VL]\n"
1834 "fmin z16.h, p1/m, z16.h, z0.h\n"
1835 "fadd z17.h, z17.h, z4.h\n"
1836 "st1h z13.h, p1, [%[outptr5], #1, MUL VL]\n"
1837 "fadd z18.h, z18.h, z2.h\n"
1838 "fmax z15.h, p0/m, z15.h, z1.h\n"
1839 "fmax z16.h, p1/m, z16.h, z1.h\n"
1840 "st1h z14.h, p2, [%[outptr5], #2, MUL VL]\n"
1841 "fmin z17.h, p2/m, z17.h, z0.h\n"
1842 "addvl %[outptr5], %[outptr5], #3\n"
1843 "fmin z18.h, p0/m, z18.h, z0.h\n"
1844 "st1h z15.h, p0, [%[outptr6]]\n"
1845 "fadd z19.h, z19.h, z3.h\n"
1846 "fmax z17.h, p2/m, z17.h, z1.h\n"
1847 "fadd z20.h, z20.h, z4.h\n"
1848 "st1h z16.h, p1, [%[outptr6], #1, MUL VL]\n"
1849 "fmax z18.h, p0/m, z18.h, z1.h\n"
1850 "fmin z19.h, p1/m, z19.h, z0.h\n"
1851 "fmin z20.h, p2/m, z20.h, z0.h\n"
1852 "st1h z17.h, p2, [%[outptr6], #2, MUL VL]\n"
1853 "addvl %[outptr6], %[outptr6], #3\n"
1854 "fmax z19.h, p1/m, z19.h, z1.h\n"
1855 "fmax z20.h, p2/m, z20.h, z1.h\n"
1856 "st1h z18.h, p0, [%[outptr7]]\n"
1857 "st1h z19.h, p1, [%[outptr7], #1, MUL VL]\n"
1858 "st1h z20.h, p2, [%[outptr7], #2, MUL VL]\n"
1859 "addvl %[outptr7], %[outptr7], #3\n"
// Read-write operands: the output row pointers, the input pointer and the
// element counter are all updated inside the asm.
1860 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1861 [inptr] "+r" (inptr), [p] "+r" (p)
1862 : [w] "r" (w), [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
// p0-p2 are written by the WHILELT instructions above, so they are declared
// as clobbered; without this the compiler may assume a predicate it keeps in
// one of them survives the asm statement.
1863 : "x8", "p0", "p1", "p2", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "memory", "cc"
1864 );
1865 }
1866 break;
1867
1868
1869 }
1870 }
1871 }
1872 }
1873}
1874
1875#endif // __ARM_FEATURE_SVE